diff --git a/collector-service/app/__init__.py b/collector-service/app/__init__.py new file mode 100644 index 0000000..7bcd463 --- /dev/null +++ b/collector-service/app/__init__.py @@ -0,0 +1 @@ +"""Collector service source overlay for legacy pyc-backed app.""" diff --git a/collector-service/app/core_main.py b/collector-service/app/core_main.py new file mode 100644 index 0000000..bd3e93f --- /dev/null +++ b/collector-service/app/core_main.py @@ -0,0 +1,2952 @@ +from __future__ import annotations + +import asyncio +import httpx +import json +import os +import re +import secrets +import shutil +import socket +import subprocess +import uuid +from datetime import datetime, timezone +from pathlib import Path +from typing import Any +from urllib.parse import urljoin, urlparse + +from fastapi import Body, Depends, FastAPI, File, Form, Header, HTTPException, Query, UploadFile +from fastapi.middleware.cors import CORSMiddleware +from fastapi.staticfiles import StaticFiles +from pydantic import BaseModel, Field + +from .database import Database, utc_now +from .integrations import AsrHttpClient, CutVideoClient, HuobaoDramaClient, N8NClient +from .openai_compat import OpenAICompatClient + +BASE_DIR = Path(__file__).resolve().parents[2] +DATA_DIR = Path(os.getenv("DATA_DIR", BASE_DIR / "data" / "collector")) +DOWNLOADS_DIR = DATA_DIR / "downloads" +JOBS_DIR = DATA_DIR / "jobs" +MODELS_DIR = DATA_DIR / "models" +DB_PATH = os.getenv("DATABASE_PATH", str(DATA_DIR / "storyforge.db")) +DEFAULT_EXTERNAL_BASE_URL = os.getenv("DEFAULT_EXTERNAL_BASE_URL", "https://test.hyzq.net/storyforge") +LOCAL_OPENAI_BASE_URL = os.getenv("LOCAL_OPENAI_BASE_URL", "http://127.0.0.1:8317/v1") +LOCAL_OPENAI_MODEL = os.getenv("LOCAL_OPENAI_MODEL", "GLM-5") +LOCAL_OPENAI_API_KEY = os.getenv("LOCAL_OPENAI_API_KEY", "") +YTDLP_BIN = os.getenv("YTDLP_BIN", "yt-dlp") +FFMPEG_BIN = os.getenv("FFMPEG_BIN", "ffmpeg") +WHISPER_BIN = os.getenv("WHISPER_BIN", "") +WHISPER_MODEL = os.getenv("WHISPER_MODEL", 
str(MODELS_DIR / "ggml-base.en.bin")) +ASR_HTTP_BASE_URL = os.getenv("ASR_HTTP_BASE_URL", "http://127.0.0.1:8088") +ASR_HTTP_TRANSCRIBE_PATH = os.getenv("ASR_HTTP_TRANSCRIBE_PATH", "/transcribe") +ASR_HTTP_FIELD_NAME = os.getenv("ASR_HTTP_FIELD_NAME", "wav") +ASR_HTTP_TIMEOUT_SEC = float(os.getenv("ASR_HTTP_TIMEOUT_SEC", "120")) +N8N_BASE_URL = os.getenv("N8N_BASE_URL", "http://127.0.0.1:5670") +N8N_ANALYSIS_WEBHOOK_PATH = os.getenv("N8N_ANALYSIS_WEBHOOK_PATH", "/webhook/storyforge-analysis") +N8N_REAL_CUT_WEBHOOK_PATH = os.getenv("N8N_REAL_CUT_WEBHOOK_PATH", "/webhook/storyforge-real-cut") +N8N_AI_VIDEO_WEBHOOK_PATH = os.getenv("N8N_AI_VIDEO_WEBHOOK_PATH", "/webhook/storyforge-ai-video") +N8N_CONTENT_SOURCE_SYNC_WEBHOOK_PATH = os.getenv("N8N_CONTENT_SOURCE_SYNC_WEBHOOK_PATH", "/webhook/storyforge-content-source-sync") +ORCHESTRATOR_SHARED_SECRET = os.getenv("ORCHESTRATOR_SHARED_SECRET", "") +CUTVIDEO_BASE_URL = os.getenv("CUTVIDEO_BASE_URL", "http://192.168.31.18:7860") +CUTVIDEO_API_KEY = os.getenv("CUTVIDEO_API_KEY", "") +HUOBAO_BASE_URL = os.getenv("HUOBAO_BASE_URL", "http://127.0.0.1:5678") +CUTVIDEO_BASE_CONFIG = os.getenv("CUTVIDEO_BASE_CONFIG", "example.job.yaml") +CUTVIDEO_POLL_INTERVAL_SEC = int(os.getenv("CUTVIDEO_POLL_INTERVAL_SEC", "10")) +CUTVIDEO_MAX_WAIT_SEC = int(os.getenv("CUTVIDEO_MAX_WAIT_SEC", "1800")) +CUTVIDEO_UPLOAD_TIMEOUT_SEC = int(os.getenv("CUTVIDEO_UPLOAD_TIMEOUT_SEC", "1800")) +HUOBAO_POLL_INTERVAL_SEC = int(os.getenv("HUOBAO_POLL_INTERVAL_SEC", "10")) +HUOBAO_MAX_WAIT_SEC = int(os.getenv("HUOBAO_MAX_WAIT_SEC", "900")) + +for path in (DATA_DIR, DOWNLOADS_DIR, JOBS_DIR, MODELS_DIR): + path.mkdir(parents=True, exist_ok=True) + +db = Database(DB_PATH) +openai_client = OpenAICompatClient() +asr_http_client = AsrHttpClient( + base_url=ASR_HTTP_BASE_URL, + transcribe_path=ASR_HTTP_TRANSCRIBE_PATH, + field_name=ASR_HTTP_FIELD_NAME, + timeout=ASR_HTTP_TIMEOUT_SEC, +) +n8n_client = N8NClient( + base_url=N8N_BASE_URL, + workflow_paths={ + 
# Browser origins allowed to call this API. Read from CORS_ALLOW_ORIGINS as a
# comma-separated list; the default "*" keeps the previous allow-all behaviour.
# NOTE(review): wildcard origins together with allow_credentials=True lets any
# website send credentialed cross-origin requests. FastAPI's CORS docs require
# explicit origins when credentials are allowed, so set CORS_ALLOW_ORIGINS to
# the real front-end origin(s) in production.
_cors_allow_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

app = FastAPI(title="StoryForge Collector Service", version="0.2.0")
app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_allow_origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# NOTE(review): this static mount has no auth dependency attached, so every
# file under DOWNLOADS_DIR is served to anyone who knows its path.
app.mount("/downloads", StaticFiles(directory=str(DOWNLOADS_DIR)), name="downloads")
class RealCutJobRequest(BaseModel):
    """Request body describing a real-cut (rough-cut editing) job.

    Only ``title`` is required; every other field falls back to the default
    declared below. The handler that consumes this model is not visible in
    this part of the file, so the field notes are limited to what the names
    and defaults show.
    """

    project_id: str = ""
    title: str
    # presumably a media directory on the cutting backend — TODO confirm
    input_dir: str = ""
    # presumably the id of an earlier job whose media is reused — verify
    # against the endpoint that accepts this request
    source_job_id: str = ""
    base_config: str = ""
    # The default (Chinese) reads: "Keep high-information-density segments and
    # output a rough cut suitable for short-video platforms."
    objective: str = "保留高信息密度片段,输出适合短视频平台的粗剪结果"
    # Durations are in seconds (per the ``_sec`` suffix); no bounds are
    # enforced by this model.
    target_duration_sec: int = 60
    target_aspect_ratio: str = "9:16"
    ideal_segment_duration_sec: int = 8
    max_segment_duration_sec: int = 18
    transcript_backend: str = "auto"
    transcript_device: str = "cuda"
    review_enabled: bool = False
    dry_run: bool = False
def mask_api_key(value: str) -> str:
    """Return a display-safe rendering of an API key.

    Args:
        value: The raw key; may be empty.

    Returns:
        ``""`` for an empty key, a run of ``*`` of equal length for keys of
        up to 8 characters, otherwise ``<prefix>***<suffix>``. Prefix and
        suffix are each at most 4 characters and never more than a quarter of
        the key, so keys of 16+ characters render exactly as before.
    """
    if not value:
        return ""
    length = len(value)
    if length <= 8:
        return "*" * length
    # A fixed 4+4 reveal exposes 8 of 9 characters for a 9-character key.
    # Scaling the visible part with the key length keeps at least half hidden.
    visible = min(4, length // 4)
    return f"{value[:visible]}***{value[-visible:]}"
def model_profile_for_account(account_id: str, requested_id: str | None) -> dict[str, Any]:
    """Resolve the model profile an account should use.

    Lookup order:
      1. ``requested_id``, if given and visible to the account;
      2. the account's ``preferred_analysis_model_id``, if visible;
      3. the first default profile visible to the account (system profiles
         first, then oldest).

    A profile is visible when it has no owner (``owner_account_id IS NULL``)
    or is owned by ``account_id``.

    Raises:
        HTTPException: 500 when no visible default profile exists.
    """

    def _visible_profile(profile_id: str | None) -> dict[str, Any] | None:
        # Single place for the "by id, but only if this account may see it" lookup.
        if not profile_id:
            return None
        return db.fetch_one(
            "SELECT * FROM model_profiles WHERE id = ? AND (owner_account_id IS NULL OR owner_account_id = ?)",
            (profile_id, account_id),
        )

    row = _visible_profile(requested_id)
    if row:
        return row

    account = db.fetch_one("SELECT preferred_analysis_model_id FROM accounts WHERE id = ?", (account_id,))
    row = _visible_profile((account or {}).get("preferred_analysis_model_id") or "")
    if row:
        return row

    # The fallback previously ignored ownership, so an account could be handed
    # another account's private default profile (including its API key).
    # Apply the same visibility rule as the lookups above.
    row = db.fetch_one(
        "SELECT * FROM model_profiles "
        "WHERE is_default = 1 AND (owner_account_id IS NULL OR owner_account_id = ?) "
        "ORDER BY is_system DESC, created_at ASC LIMIT 1",
        (account_id,),
    )
    if not row:
        raise HTTPException(status_code=500, detail="No model profile configured")
    return row
def resolve_target_assistant(account_id: str, requested_assistant_id: str | None, project_id: str = "") -> dict[str, Any] | None:
    """Look up the assistant a request refers to, if any.

    Returns ``None`` when no assistant id was supplied. Otherwise the
    assistant row owned by ``account_id`` is returned.

    Raises:
        HTTPException: 404 when the assistant does not exist for this
            account; 400 when both the assistant and the request name a
            project and the two differ.
    """
    if not requested_assistant_id:
        return None

    lookup_sql = "SELECT * FROM assistants WHERE id = ? AND user_id = ?"
    found = db.fetch_one(lookup_sql, (requested_assistant_id, account_id))
    if not found:
        raise HTTPException(status_code=404, detail="Assistant not found")

    owning_project = found.get("project_id")
    # Only a conflict when both sides actually name a project.
    if project_id and owning_project and owning_project != project_id:
        raise HTTPException(status_code=400, detail="Assistant does not belong to target project")
    return found
def parse_json_object(raw_text: str) -> dict[str, Any]:
    """Extract a JSON object from text, tolerating surrounding noise.

    The whole (stripped) text is tried first. If that is not valid JSON, the
    text is scanned for the first ``{`` from which a complete JSON object can
    be decoded; this copes with model output wrapped in Markdown fences or
    followed by extra prose that itself contains braces.

    Args:
        raw_text: Text that may contain a JSON object. ``None`` or empty text
            is treated as "no object".

    Returns:
        The decoded object, or ``{}`` when the text is empty, is valid JSON of
        another type (list, string, ...), or contains no decodable object.
    """
    cleaned = (raw_text or "").strip()
    if not cleaned:
        return {}

    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        pass
    else:
        return data if isinstance(data, dict) else {}

    # A greedy "{...}" regex spans from the first "{" to the last "}", which
    # breaks as soon as anything after the object contains a brace. Decoding
    # from each candidate "{" finds the first well-formed object instead.
    decoder = json.JSONDecoder()
    start = cleaned.find("{")
    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(cleaned, start)
        except json.JSONDecodeError:
            start = cleaned.find("{", start + 1)
            continue
        return candidate if isinstance(candidate, dict) else {}
    return {}
def document_payload(row: dict[str, Any]) -> dict[str, Any]:
    """Convert a ``knowledge_documents`` row into its API representation.

    ``analysis_json`` and ``source_artifact_json`` are decoded with
    ``parse_json_object``. ``storyboard_json`` is decoded as a JSON list; any
    undecodable or non-list value yields ``[]`` so that ``storyboards`` always
    has a stable type for clients.
    """
    analysis_map = parse_json_object(row.get("analysis_json") or "{}")
    source_artifacts = parse_json_object(row.get("source_artifact_json") or "{}")

    try:
        storyboard_items = json.loads(row.get("storyboard_json") or "[]")
    except json.JSONDecodeError:
        storyboard_items = []
    # Valid JSON that is not a list (e.g. "null" or an object) used to leak
    # through unchanged; normalise it the same way as a decode failure.
    if not isinstance(storyboard_items, list):
        storyboard_items = []

    return {
        "id": row["id"],
        "knowledge_base_id": row["knowledge_base_id"],
        "title": row["title"],
        "source_type": row["source_type"],
        "source_url": row.get("source_url", ""),
        "transcript_text": row.get("transcript_text", ""),
        "style_summary": row.get("style_summary", ""),
        "combined_text": row.get("combined_text", ""),
        "analysis": analysis_map,
        "storyboards": storyboard_items,
        "source_artifacts": source_artifacts,
        "analysis_model_profile_id": row.get("analysis_model_profile_id", ""),
        "created_at": row["created_at"],
        "updated_at": row["updated_at"],
    }
def require_auth(authorization: str | None = Header(default=None)) -> dict[str, Any]:
    """FastAPI dependency: resolve the account behind a bearer token.

    Args:
        authorization: Raw ``Authorization`` header value.

    Returns:
        The ``accounts`` row that the token belongs to.

    Raises:
        HTTPException: 401 when the header is missing, is not a bearer
            credential, carries an empty token, the token is unknown, or its
            account no longer exists.
    """
    scheme, _, credentials = (authorization or "").partition(" ")
    token = credentials.strip()
    # HTTP auth scheme names are case-insensitive, so "bearer <token>" is
    # accepted too. An empty token is rejected before touching the database.
    if scheme.lower() != "bearer" or not token:
        raise HTTPException(status_code=401, detail="Missing bearer token")

    # NOTE(review): no expiry check is visible here — confirm whether
    # auth_tokens rows are expired or revoked elsewhere.
    token_row = db.fetch_one("SELECT * FROM auth_tokens WHERE token = ?", (token,))
    if not token_row:
        raise HTTPException(status_code=401, detail="Invalid token")

    account = db.fetch_one("SELECT * FROM accounts WHERE id = ?", (token_row["account_id"],))
    if not account:
        raise HTTPException(status_code=401, detail="Account not found")
    return account
def require_orchestrator(x_orchestrator_secret: str | None = Header(default=None)) -> bool:
    """FastAPI dependency: authenticate calls coming from the orchestrator.

    Args:
        x_orchestrator_secret: Value of the ``X-Orchestrator-Secret`` header.

    Returns:
        ``True`` when the caller is accepted.

    Raises:
        HTTPException: 401 when a shared secret is configured and the header
            does not match it.
    """
    if not ORCHESTRATOR_SHARED_SECRET:
        # NOTE(review): with no secret configured every caller is accepted
        # (unchanged behaviour). Confirm this fail-open default is intended
        # outside local development.
        return True

    # Compare as bytes and in constant time: a plain "!=" leaks how many
    # leading characters matched, and compare_digest rejects non-ASCII str.
    supplied = (x_orchestrator_secret or "").encode("utf-8")
    expected = ORCHESTRATOR_SHARED_SECRET.encode("utf-8")
    if not secrets.compare_digest(supplied, expected):
        raise HTTPException(status_code=401, detail="Invalid orchestrator secret")
    return True
def job_context_payload(row: dict[str, Any]) -> dict[str, Any]:
    """Build the job payload enriched with its related records.

    On top of ``job_payload(row)`` the result carries the project, assistant,
    knowledge base, content source and parent job (each ``None`` when absent
    or not found), plus the list of child jobs and the job's events, both
    ordered by creation time.
    """
    context = job_payload(row)
    context.update(
        {
            "parent_job": None,
            "child_jobs": [],
            "project": None,
            "assistant": None,
            "knowledge_base": None,
            "content_source": None,
            "events": [],
        }
    )

    project_id = row.get("project_id")
    if project_id:
        project_row = db.fetch_one("SELECT * FROM projects WHERE id = ?", (project_id,))
        if project_row:
            context["project"] = project_payload(project_row)

    assistant_id = row.get("assistant_id")
    if assistant_id:
        assistant_row = db.fetch_one("SELECT * FROM assistants WHERE id = ?", (assistant_id,))
        if assistant_row:
            context["assistant"] = assistant_payload(assistant_row)

    # The knowledge base is looked up unconditionally.
    kb_row = db.fetch_one("SELECT * FROM knowledge_bases WHERE id = ?", (row["knowledge_base_id"],))
    if kb_row:
        context["knowledge_base"] = knowledge_base_payload(kb_row)

    source_id = row.get("content_source_id")
    if source_id:
        source_row = db.fetch_one("SELECT * FROM content_sources WHERE id = ?", (source_id,))
        if source_row:
            context["content_source"] = content_source_payload(source_row)

    parent_id = row.get("parent_job_id")
    if parent_id:
        parent_row = db.fetch_one("SELECT * FROM jobs WHERE id = ?", (parent_id,))
        if parent_row:
            context["parent_job"] = job_payload(parent_row)

    child_rows = db.fetch_all("SELECT * FROM jobs WHERE parent_job_id = ? ORDER BY created_at ASC", (row["id"],))
    context["child_jobs"] = [job_payload(child) for child in child_rows]

    event_rows = db.fetch_all("SELECT * FROM job_events WHERE job_id = ? ORDER BY created_at ASC", (row["id"],))
    context["events"] = [job_event_payload(event) for event in event_rows]

    return context
async def call_model(profile: dict[str, Any], system_prompt: str, user_prompt: str, temperature: float = 0.4) -> str:
    """Run a chat completion and return its text, with a canned fallback.

    Args:
        profile: Model profile row; ``base_url`` and ``model_name`` are
            required, ``api_key`` is optional.
        system_prompt: System message for the model.
        user_prompt: User message for the model.
        temperature: Sampling temperature passed to the client.

    Returns:
        The stripped model output. When the call fails or returns nothing, a
        fixed style-summary sentence followed by the first 220 characters of
        ``user_prompt`` (newlines replaced by spaces) is returned instead.
    """
    import logging

    try:
        content = await openai_client.chat_completion(
            base_url=profile["base_url"],
            api_key=profile.get("api_key", ""),
            model=profile["model_name"],
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            temperature=temperature,
        )
        if content:
            return content.strip()
    except Exception:
        # Falling back is deliberate (callers always get text), but the
        # failure must be recorded: otherwise a misconfigured profile
        # silently produces canned output forever. The API key is not logged.
        logging.getLogger(__name__).exception(
            "Model call failed (model=%s); returning fallback text",
            profile.get("model_name") if isinstance(profile, dict) else None,
        )
    excerpt = user_prompt.strip().replace("\n", " ")[:220]
    return f"风格摘要:内容以强结论开头,节奏偏短句,强调冲突转折和行动指令。素材摘要:{excerpt}"
def infer_platform_from_url(source_url: str) -> str:
    """Guess the content platform from a URL by domain substring.

    The URL is trimmed and lower-cased, then checked against each platform's
    known domains in a fixed order; the first platform with a matching domain
    wins. Returns ``""`` when nothing matches.
    """
    lowered = source_url.strip().lower()
    # Order matters: earlier entries take precedence when several match.
    domain_table = (
        ("bilibili", ("bilibili.com", "b23.tv")),
        ("douyin", ("douyin.com", "iesdouyin.com")),
        ("xiaohongshu", ("xiaohongshu.com", "xhslink.com")),
        ("youtube", ("youtube.com", "youtu.be")),
    )
    for platform, markers in domain_table:
        if any(marker in lowered for marker in markers):
            return platform
    return ""
def discover_account_video_links(source_url: str, max_items: int) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """List the videos behind an account/playlist URL using yt-dlp.

    Args:
        source_url: Account, channel or playlist URL (caller supplied).
        max_items: Upper bound passed to yt-dlp's ``--playlist-end``.

    Returns:
        ``(items, debug_payload)``. Each item has ``video_url``, ``title``
        (a placeholder when yt-dlp reports none) and ``external_id``; entries
        without a URL and duplicate URLs are dropped. ``debug_payload``
        records the command, a preview of its output, stderr and exit code.

    Raises:
        HTTPException: 503 when yt-dlp is not installed; 502 when yt-dlp
            exits with a non-zero status (including a timeout).
    """
    if not command_exists(YTDLP_BIN):
        raise HTTPException(status_code=503, detail="yt-dlp is not configured")

    discovery_cmd = [
        YTDLP_BIN,
        "--flat-playlist",
        "--playlist-end",
        str(max_items),
        "--print",
        "%(webpage_url)s\t%(title)s\t%(id)s",
        # End of options: source_url is user input and must never be parsed
        # as a yt-dlp flag (e.g. a value starting with "--exec").
        "--",
        source_url,
    ]
    code, stdout, stderr = run_command(discovery_cmd, timeout=180)
    if code != 0:
        # Fail before parsing; partial output from a failed run is not used.
        raise HTTPException(status_code=502, detail=f"Failed to inspect content source: {stderr.strip()[:200] or 'yt-dlp error'}")

    raw_lines = [line.strip() for line in stdout.splitlines() if line.strip()]
    items: list[dict[str, Any]] = []
    seen_urls: set[str] = set()
    for line in raw_lines:
        parts = line.split("\t")
        video_url = parts[0].strip() if parts else ""
        raw_title = parts[1].strip() if len(parts) > 1 else ""
        raw_external_id = parts[2].strip() if len(parts) > 2 else ""
        # yt-dlp prints "NA" for missing fields.
        if not video_url or video_url == "NA" or video_url in seen_urls:
            continue
        seen_urls.add(video_url)
        items.append(
            {
                "video_url": video_url,
                "title": raw_title if raw_title and raw_title != "NA" else "短视频素材",
                "external_id": raw_external_id if raw_external_id != "NA" else "",
            }
        )

    debug_payload = {
        "discovery_command": discovery_cmd,
        "discovery_stdout_preview": raw_lines[:max_items],
        "discovery_stderr": stderr.strip()[:1000],
        "discovery_exit_code": code,
    }
    return items, debug_payload
resolve_real_cut_source_file(source_job) + folder_name = f"storyforge-{source_job['id']}" + upload_payload = await cutvideo_client.upload_source_file(source_path, folder_name=folder_name) + input_dir = str(upload_payload.get("input_dir") or "").strip() + if not input_dir: + raise HTTPException(status_code=502, detail="CutVideo upload did not return input_dir") + return { + "input_dir": input_dir, + "source_path": str(source_path), + "upload": upload_payload, + "source_artifacts": source_artifacts, + } + + +def create_job_record( + *, + account_id: str, + project_id: str, + knowledge_base_id: str, + parent_job_id: str | None = None, + source_type: str, + line_type: str, + workflow_key: str, + title: str, + language: str = "auto", + source_url: str = "", + assistant_id: str | None = None, + content_source_id: str | None = None, + artifacts: dict[str, Any] | None = None, + analysis_model_profile_id: str = "", +) -> dict[str, Any]: + job_id = make_id("job") + now = utc_now() + db.execute( + """ + INSERT INTO jobs ( + id, user_id, project_id, parent_job_id, assistant_id, knowledge_base_id, content_source_id, + source_type, line_type, workflow_key, orchestrator, provider_name, provider_task_id, + source_url, title, language, status, transcript_text, style_summary, upload_status, + error, artifacts_json, result_json, analysis_model_profile_id, created_at, updated_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'n8n', '', '', ?, ?, ?, 'pending', '', '', 'pending', '', ?, '{}', ?, ?, ?) 
+ """, + ( + job_id, + account_id, + project_id, + parent_job_id, + assistant_id, + knowledge_base_id, + content_source_id, + source_type, + line_type, + workflow_key, + source_url or None, + title, + language, + json.dumps(artifacts or {}, ensure_ascii=False), + analysis_model_profile_id, + now, + now, + ), + ) + return db.fetch_one("SELECT * FROM jobs WHERE id = ?", (job_id,)) + + +async def wait_for_huobao_image(image_id: str | int) -> dict[str, Any]: + deadline = now_ts() + HUOBAO_MAX_WAIT_SEC + last_payload: dict[str, Any] = {} + while True: + last_payload = await huobao_client.get_image(str(image_id)) + status = str(last_payload.get("status") or "").lower() + if status in {"completed", "failed"}: + return last_payload + if now_ts() >= deadline: + raise RuntimeError(f"Huobao image task timed out: {image_id}") + await asyncio.sleep(HUOBAO_POLL_INTERVAL_SEC) + + +async def wait_for_huobao_video(video_id: str | int) -> dict[str, Any]: + deadline = now_ts() + HUOBAO_MAX_WAIT_SEC + last_payload: dict[str, Any] = {} + while True: + last_payload = await huobao_client.get_video(str(video_id)) + status = str(last_payload.get("status") or "").lower() + if status in {"completed", "failed"}: + return last_payload + if now_ts() >= deadline: + raise RuntimeError(f"Huobao video task timed out: {video_id}") + await asyncio.sleep(HUOBAO_POLL_INTERVAL_SEC) + + +def coerce_storyboards(items: Any) -> list[dict[str, Any]]: + if not isinstance(items, list): + return [] + return [item for item in items if isinstance(item, dict)] + + +def huobao_image_size_for_aspect_ratio(aspect_ratio: str) -> str: + normalized = str(aspect_ratio or "").strip() + if normalized == "9:16": + return "1024x1536" + if normalized == "16:9": + return "1536x1024" + if normalized == "1:1": + return "1024x1024" + return "1024x1536" + + +async def transcribe_media(job_dir: Path, source_path: Path, title: str, source_url: str = "") -> tuple[str, dict[str, Any]]: + artifacts: dict[str, Any] = {} + transcript = 
"" + media_path = source_path + artifacts["source_path"] = str(media_path) + + if not source_path.exists(): + transcript = ( + f"素材标题:{title}\n" + f"素材来源:{source_url or source_path.name}\n\n" + "当前环境未找到可直接处理的本地视频文件,已记录来源信息并进入降级学习流程。" + ) + return transcript, artifacts + + audio_path = job_dir / "audio.wav" + if command_exists(FFMPEG_BIN): + code, _, err = run_command([FFMPEG_BIN, "-y", "-i", str(source_path), "-ar", "16000", "-ac", "1", str(audio_path)]) + if code == 0 and audio_path.exists(): + artifacts["audio_path"] = str(audio_path) + media_path = audio_path + elif err: + artifacts["ffmpeg_error"] = err.strip()[:500] + + if asr_http_client.enabled and media_path.exists(): + try: + asr_payload = await asr_http_client.transcribe_audio(media_path) + artifacts["asr_http_payload"] = { + "success": bool(asr_payload.get("success", True)), + "duration_ms": asr_payload.get("duration_ms"), + "error_message": str(asr_payload.get("error_message") or "")[:500], + } + transcript = str(asr_payload.get("text") or "").strip() + if transcript: + artifacts["asr_backend"] = "http" + except Exception as exc: + error_detail = str(exc).strip() or exc.__class__.__name__ + artifacts["asr_http_error"] = error_detail[:500] + + if WHISPER_BIN and Path(WHISPER_BIN).exists() and Path(WHISPER_MODEL).exists(): + out_prefix = job_dir / "whisper" + code, stdout, stderr = run_command([ + WHISPER_BIN, + "-m", + WHISPER_MODEL, + "-f", + str(media_path), + "-otxt", + "-of", + str(out_prefix), + ]) + txt_path = Path(str(out_prefix) + ".txt") + if code == 0 and txt_path.exists(): + cli_transcript = txt_path.read_text(encoding="utf-8", errors="ignore").strip() + if cli_transcript: + transcript = cli_transcript + artifacts["transcript_path"] = str(txt_path) + artifacts["asr_backend"] = artifacts.get("asr_backend") or "whisper_cli" + else: + artifacts["whisper_stdout"] = stdout.strip()[:500] + artifacts["whisper_error"] = stderr.strip()[:500] + + if not transcript: + transcript = ( + f"素材标题:{title}\n" + 
def ensure_user_kb(account_id: str, project_id: str = "", username: str = "默认用户") -> dict[str, Any]:
    """Return the oldest knowledge base of the account's project, creating one if none exists.

    Args:
        account_id: Owner of the knowledge base.
        project_id: Requested project; an empty string is passed on as ``None``
            to ``resolve_target_project``.
        username: Forwarded to ``resolve_target_project``.

    Returns:
        The existing or newly inserted ``knowledge_bases`` row.
    """
    target_project = resolve_target_project(account_id, project_id or None, username=username)
    target_project_id = target_project["id"]

    existing = db.fetch_one(
        "SELECT * FROM knowledge_bases WHERE user_id = ? AND project_id = ? ORDER BY created_at ASC LIMIT 1",
        (account_id, target_project_id),
    )
    if not existing:
        new_kb_id = make_id("kb")
        created_at = utc_now()
        db.execute(
            """
            INSERT INTO knowledge_bases (id, user_id, project_id, name, description, sync_status, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (new_kb_id, account_id, target_project_id, "默认知识库", "系统为新用户自动创建", "ready", created_at, created_at),
        )
        # Re-read so the caller gets the row exactly as stored.
        return db.fetch_one("SELECT * FROM knowledge_bases WHERE id = ?", (new_kb_id,))
    return existing
WHERE id = ?", ("processing", now, job_id)) + append_job_event(job_id, "job.processing", {}) + + try: + artifacts = json.loads(row.get("artifacts_json") or "{}") + transcript_text = row.get("transcript_text", "") + job_dir = JOBS_DIR / job_id + job_dir.mkdir(parents=True, exist_ok=True) + + if row["source_type"] == "text": + transcript_text = fallback_transcript_from_text(row["title"], artifacts.get("input_text", "")) + elif row["source_type"] == "video_link": + downloaded = job_dir / "source.mp4" + if command_exists(YTDLP_BIN): + code, stdout, stderr = run_command([ + YTDLP_BIN, + "--no-playlist", + "-o", + str(downloaded), + row.get("source_url") or "", + ], cwd=job_dir) + if code == 0 and downloaded.exists(): + artifacts["download_stdout"] = stdout.strip()[:500] + else: + artifacts["download_error"] = stderr.strip()[:500] + transcript_text, extra = await transcribe_media(job_dir, downloaded if downloaded.exists() else job_dir / "placeholder.mp4", row["title"], row.get("source_url") or "") + artifacts.update(extra) + elif row["source_type"] == "upload_video": + source_path = Path(artifacts.get("uploaded_path", "")) + transcript_text, extra = await transcribe_media(job_dir, source_path, row["title"], row.get("source_url") or "") + artifacts.update(extra) + + profile = model_profile_for_account(row["user_id"], row.get("analysis_model_profile_id") or None) + style_summary = await summarize_style(profile, transcript_text, row["title"]) + assistant = None + if row.get("assistant_id"): + assistant = db.fetch_one("SELECT * FROM assistants WHERE id = ?", (row["assistant_id"],)) + content_blueprint = await generate_content_blueprint( + profile, + title=row["title"], + transcript_text=transcript_text, + style_summary=style_summary, + agent_prompt=(assistant or {}).get("system_prompt", ""), + generation_goal=(assistant or {}).get("generation_goal", ""), + ) + combined_text = ( + f"{transcript_text}\n\n" + "------\n" + f"风格学习结论:\n{style_summary}\n\n" + "------\n" + 
f"二创文案:\n{(content_blueprint.get('rewrite') or {}).get('script', '')}\n\n" + "------\n" + f"分镜:\n{json.dumps(content_blueprint.get('storyboards') or [], ensure_ascii=False, indent=2)}" + ) + kb_row = db.fetch_one("SELECT * FROM knowledge_bases WHERE id = ?", (row["knowledge_base_id"],)) + if not kb_row: + raise RuntimeError("Knowledge base not found") + document_id = make_id("doc") + timestamp = utc_now() + db.execute( + """ + INSERT INTO knowledge_documents ( + id, knowledge_base_id, title, source_type, source_url, transcript_text, + style_summary, combined_text, analysis_json, storyboard_json, source_artifact_json, + analysis_model_profile_id, created_at, updated_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + ( + document_id, + row["knowledge_base_id"], + row["title"], + row["source_type"], + row.get("source_url") or "", + transcript_text, + style_summary, + combined_text, + json.dumps(content_blueprint.get("analysis") or {}, ensure_ascii=False), + json.dumps(content_blueprint.get("storyboards") or [], ensure_ascii=False), + json.dumps(artifacts, ensure_ascii=False), + profile["id"], + timestamp, + timestamp, + ), + ) + update_job_state( + job_id, + status="completed", + artifacts={ + "document_id": document_id, + "project_job_dir": str(job_dir), + **artifacts, + }, + result={ + "analysis": content_blueprint.get("analysis") or {}, + "rewrite": content_blueprint.get("rewrite") or {}, + "storyboards": content_blueprint.get("storyboards") or [], + "document_id": document_id, + }, + ) + db.execute( + """ + UPDATE jobs + SET transcript_text = ?, style_summary = ?, upload_status = ?, updated_at = ? + WHERE id = ? + """, + (transcript_text, style_summary, "ready", timestamp, job_id), + ) + db.execute( + "UPDATE knowledge_bases SET sync_status = ?, updated_at = ? 
def probe_tcp(url: str, timeout: float = 3.0) -> dict[str, Any]:
    """Check whether the host and port named by ``url`` accept a TCP connection.

    Args:
        url: Service URL; only its scheme, host and port are used. When the
            URL has no explicit port, 443 is used for ``https`` and 80
            otherwise.
        timeout: Connection timeout in seconds.

    Returns:
        A dict with ``configured``, ``reachable``, ``status_code`` (always 0
        here), ``error`` and ``url``. ``error`` is ``not_configured`` for an
        empty URL, ``invalid_url`` when no host/port can be extracted, the
        exception text when the connection fails, and ``""`` on success.
    """
    if not url:
        return {"configured": False, "reachable": False, "status_code": 0, "error": "not_configured", "url": ""}

    try:
        parsed = urlparse(url)
        host = parsed.hostname
        # ``.port`` raises ValueError for a non-numeric or out-of-range port,
        # and urlparse itself raises for a malformed IPv6 literal.
        port = parsed.port or (443 if parsed.scheme == "https" else 80)
    except ValueError:
        host, port = None, 0
    if not host:
        return {"configured": True, "reachable": False, "status_code": 0, "error": "invalid_url", "url": url}

    try:
        # create_connection resolves the host and tries each returned address,
        # so IPv6-only targets work; the context manager closes the socket.
        with socket.create_connection((host, port), timeout=timeout):
            pass
    except (OSError, ValueError) as exc:  # pragma: no cover - operational probe
        return {"configured": True, "reachable": False, "status_code": 0, "error": str(exc), "url": url}
    return {"configured": True, "reachable": True, "status_code": 0, "error": "", "url": url}
"127.0.0.1" + if host in {"host.docker.internal", "localhost"}: + host = "127.0.0.1" + port = parsed.port + root = f"{scheme}://{host}" + if port: + root = f"{root}:{port}" + return root + + +def fetch_local_model_catalog(timeout: float = 8.0) -> dict[str, Any]: + detail = probe_http(LOCAL_OPENAI_BASE_URL, "/models", timeout=timeout) + public_base_url = local_model_public_base_url() + management_url = f"{public_base_url}/management.html" if public_base_url else "" + payload = { + "configured": detail.get("configured", False), + "reachable": detail.get("reachable", False), + "base_url": LOCAL_OPENAI_BASE_URL, + "public_base_url": public_base_url, + "management_url": management_url, + "default_model": LOCAL_OPENAI_MODEL, + "models": [], + "status_code": detail.get("status_code", 0), + "error": detail.get("error", ""), + "url": detail.get("url", ""), + } + if not detail.get("configured") or not detail.get("reachable"): + return payload + try: + response = httpx.get( + urljoin(LOCAL_OPENAI_BASE_URL if LOCAL_OPENAI_BASE_URL.endswith("/") else f"{LOCAL_OPENAI_BASE_URL}/", "models"), + timeout=timeout, + ) + response.raise_for_status() + data = response.json() + payload["models"] = [ + { + "id": item.get("id", ""), + "owned_by": item.get("owned_by", ""), + "created": item.get("created", 0), + } + for item in (data.get("data") or []) + if isinstance(item, dict) + ] + except Exception as exc: # pragma: no cover - operational probe + payload["reachable"] = False + payload["error"] = str(exc) + return payload + + +@app.get("/healthz") +def healthz() -> dict[str, Any]: + return { + "status": "ok", + "dbPath": DB_PATH, + "defaultExternalBaseUrl": DEFAULT_EXTERNAL_BASE_URL, + "localModelBaseUrl": LOCAL_OPENAI_BASE_URL, + "asrHttpBaseUrl": ASR_HTTP_BASE_URL, + "n8nBaseUrl": N8N_BASE_URL, + "cutvideoBaseUrl": CUTVIDEO_BASE_URL, + "cutvideoUploadTimeoutSec": CUTVIDEO_UPLOAD_TIMEOUT_SEC, + "huobaoBaseUrl": HUOBAO_BASE_URL, + } + + +@app.get("/v2/integrations/health") +def 
def seed_defaults() -> None:
    """Insert the bootstrap rows the service expects on first start.

    Two independent steps, each skipped when its row already exists:

    1. A system-wide default model profile built from the
       ``LOCAL_OPENAI_*`` settings.
    2. The ``kris`` super-admin account together with its default project,
       knowledge base and a default assistant linked to that knowledge base.

    The bootstrap password can be supplied through the
    ``SEED_ADMIN_PASSWORD`` environment variable. When the variable is unset
    or empty, the historical built-in value is used so existing deployments
    keep working.
    """
    if not db.fetch_one("SELECT id FROM model_profiles WHERE is_default = 1 LIMIT 1"):
        profile_id = make_id("model")
        now = utc_now()
        db.execute(
            """
            INSERT INTO model_profiles (id, owner_account_id, name, provider, base_url, api_key, model_name, is_system, is_default, created_at, updated_at)
            VALUES (?, NULL, ?, ?, ?, ?, ?, 1, 1, ?, ?)
            """,
            (
                profile_id,
                "本机默认模型",
                "openai_compat",
                LOCAL_OPENAI_BASE_URL,
                LOCAL_OPENAI_API_KEY,
                LOCAL_OPENAI_MODEL,
                now,
                now,
            ),
        )

    if db.fetch_one("SELECT id FROM accounts WHERE username = ?", ("kris",)):
        return

    account_id = make_id("acct")
    # SECURITY: the fallback is a credential checked into source control.
    # Set SEED_ADMIN_PASSWORD in any real deployment.
    seed_password = os.getenv("SEED_ADMIN_PASSWORD") or "Asd123456."
    password_hash, password_salt = create_password_hash(seed_password)
    now = utc_now()
    model_row = db.fetch_one("SELECT id FROM model_profiles WHERE is_default = 1 LIMIT 1")
    default_model_id = model_row["id"] if model_row else ""
    db.execute(
        """
        INSERT INTO accounts (
            id, username, password_hash, password_salt, display_name, role,
            approval_status, approved_by, approved_at, preferred_analysis_model_id,
            created_at, updated_at
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        (
            account_id,
            "kris",
            password_hash,
            password_salt,
            "Kris",
            "super_admin",
            "approved",
            account_id,  # the bootstrap admin is recorded as approved by itself
            now,
            default_model_id,
            now,
            now,
        ),
    )
    project = ensure_default_project(account_id, username="kris")
    kb = ensure_user_kb(account_id, project["id"], username="kris")
    assistant_id = make_id("assistant")
    db.execute(
        """
        INSERT INTO assistants (id, user_id, project_id, name, description, system_prompt, generation_goal, config_json, model_profile_id, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, '{}', ?, ?, ?)
        """,
        (
            assistant_id,
            account_id,
            project["id"],
            "默认文案助手",
            "系统为超级管理员预置",
            "你是一个擅长学习短视频文案风格的 AI 助手。",
            "为用户生成稳定风格的短视频文案。",
            default_model_id,
            now,
            now,
        ),
    )
    db.execute(
        "INSERT INTO assistant_knowledge_bases (assistant_id, knowledge_base_id) VALUES (?, ?)",
        (assistant_id, kb["id"]),
    )
@app.post("/v2/auth/logout")
def logout(account: dict[str, Any] = Depends(require_auth), authorization: str | None = Header(default=None)) -> dict[str, bool]:
    """Delete the auth token carried by the ``Authorization`` header.

    The header is expected as ``<scheme> <token>``. A header with no space is
    treated as the bare token rather than raising ``IndexError``.

    Returns:
        ``{"saved": True}`` once the delete statement has run.

    Raises:
        HTTPException: 401 when the header is missing or carries no token.
    """
    _ = account  # only needed so the ``require_auth`` dependency runs
    scheme, separator, remainder = (authorization or "").strip().partition(" ")
    token = remainder.strip() if separator else scheme
    if not token:
        raise HTTPException(status_code=401, detail="Missing bearer token")
    db.execute("DELETE FROM auth_tokens WHERE token = ?", (token,))
    return {"saved": True}
@app.post("/v2/projects")
def create_project(request: ProjectCreateRequest, account: dict[str, Any] = Depends(require_approved)) -> dict[str, Any]:
    """Create a project for the calling account and make sure it has a knowledge base.

    Returns:
        The ``project_payload`` of the freshly inserted row.
    """
    new_project_id = make_id("project")
    created_at = utc_now()
    insert_params = (
        new_project_id,
        account["id"],
        request.name.strip(),
        request.description.strip(),
        created_at,
        created_at,
    )
    db.execute(
        """
        INSERT INTO projects (id, user_id, name, description, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, ?)
        """,
        insert_params,
    )
    # Called for its side effect: the project gets a knowledge base if it has none.
    ensure_user_kb(account["id"], new_project_id, username=account["username"])
    stored = db.fetch_one("SELECT * FROM projects WHERE id = ?", (new_project_id,))
    return project_payload(stored)
@app.post("/v2/content-sources")
def create_content_source_api(request: ContentSourceCreateRequest, account: dict[str, Any] = Depends(require_approved)) -> dict[str, Any]:
    """Register a content source in the caller's target project.

    All textual request fields are stripped of surrounding whitespace before
    being stored; ``metadata`` is passed through unchanged.

    Returns:
        The ``content_source_payload`` of the created row.
    """
    owner_id = account["id"]
    target_project = resolve_target_project(owner_id, request.project_id or None, username=account["username"])

    text_fields = ("source_kind", "platform", "handle", "source_url", "title", "local_path")
    cleaned = {field_name: getattr(request, field_name).strip() for field_name in text_fields}

    created = create_content_source(
        account_id=owner_id,
        project_id=target_project["id"],
        metadata=request.metadata,
        **cleaned,
    )
    return content_source_payload(created)
required") + platform = (request.platform or (source_row or {}).get("platform") or infer_platform_from_url(source_url)).strip() + handle = (request.handle or (source_row or {}).get("handle") or "").strip() + source_title = ( + request.title.strip() + or (source_row or {}).get("title", "").strip() + or handle + or source_url + ) + + if source_row and source_row.get("project_id") and source_row.get("project_id") != project["id"]: + raise HTTPException(status_code=400, detail="Content source does not belong to target project") + + if not source_row: + source_row = create_content_source( + account_id=account["id"], + project_id=project["id"], + source_kind="creator_account", + platform=platform, + handle=handle, + source_url=source_url, + title=source_title, + metadata={ + "sync_mode": "recent_uploads", + "max_items": request.max_items, + "analysis_model_profile_id": profile["id"], + }, + ) + + job_row = create_job_record( + account_id=account["id"], + project_id=project["id"], + knowledge_base_id=kb["id"], + source_type="content_source_sync", + line_type="content_source_sync", + workflow_key="content_source_sync_pipeline", + title=f"{source_title} 内容源同步", + language=request.language, + source_url=source_url, + assistant_id=(assistant or {}).get("id"), + content_source_id=source_row["id"], + artifacts={ + "platform": platform, + "handle": handle, + "source_account_url": source_url, + "source_title": source_title, + "max_items": request.max_items, + "skip_existing": request.skip_existing, + "auto_trigger_analysis": request.auto_trigger_analysis, + }, + analysis_model_profile_id=profile["id"], + ) + update_content_source_metadata( + source_row["id"], + { + "sync_mode": "recent_uploads", + "max_items": request.max_items, + "analysis_model_profile_id": profile["id"], + "last_sync_job_id": job_row["id"], + "last_sync_requested_at": utc_now(), + }, + ) + return job_payload(await trigger_orchestrated_job(job_row)) + + +@app.get("/v2/model-profiles") +def 
@app.post("/v2/model-profiles")
def create_model_profile(request: ModelProfileRequest, account: dict[str, Any] = Depends(require_approved)) -> dict[str, Any]:
    """Store a new ``openai_compat`` model profile owned by the calling account.

    When the request marks the profile as default, the default flag is first
    cleared on every profile the account already owns.

    Returns:
        The normalized representation of the inserted profile.
    """
    new_profile_id = make_id("model")
    created_at = utc_now()
    owner_id = account["id"]

    make_default = bool(request.is_default)
    if make_default:
        db.execute("UPDATE model_profiles SET is_default = 0 WHERE owner_account_id = ?", (owner_id,))

    insert_params = (
        new_profile_id,
        owner_id,
        request.name.strip(),
        request.base_url.strip(),
        request.api_key.strip(),
        request.model_name.strip(),
        int(make_default),
        created_at,
        created_at,
    )
    db.execute(
        """
        INSERT INTO model_profiles (id, owner_account_id, name, provider, base_url, api_key, model_name, is_system, is_default, created_at, updated_at)
        VALUES (?, ?, ?, 'openai_compat', ?, ?, ?, 0, ?, ?, ?)
        """,
        insert_params,
    )
    stored = db.fetch_one("SELECT * FROM model_profiles WHERE id = ?", (new_profile_id,))
    return normalize_model_profile(stored)
@app.post("/v2/knowledge-bases")
def create_knowledge_base(request: KnowledgeBaseCreateRequest, account: dict[str, Any] = Depends(require_approved)) -> dict[str, Any]:
    """Create a knowledge base in the caller's target project.

    The row is inserted with ``sync_status`` set to ``ready``.

    Returns:
        The ``knowledge_base_payload`` of the inserted row.
    """
    owner_id = account["id"]
    target_project = resolve_target_project(owner_id, request.project_id or None, username=account["username"])

    new_kb_id = make_id("kb")
    created_at = utc_now()
    insert_params = (
        new_kb_id,
        owner_id,
        target_project["id"],
        request.name.strip(),
        request.description.strip(),
        created_at,
        created_at,
    )
    db.execute(
        """
        INSERT INTO knowledge_bases (id, user_id, project_id, name, description, sync_status, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, 'ready', ?, ?)
        """,
        insert_params,
    )
    stored = db.fetch_one("SELECT * FROM knowledge_bases WHERE id = ?", (new_kb_id,))
    return knowledge_base_payload(stored)
@app.get("/v2/explore/jobs")
def list_jobs(
    parent_job_id: str | None = Query(default=None),
    line_type: str | None = Query(default=None),
    account: dict[str, Any] = Depends(require_approved),
) -> list[dict[str, Any]]:
    """List the caller's jobs, newest first, with optional filters.

    Args:
        parent_job_id: Omitted means no parent filter. A blank value selects
            jobs with no parent; any other value selects that parent's
            children.
        line_type: When non-empty, restricts the result to that line type
            (compared after stripping whitespace).

    Returns:
        One ``job_payload`` per matching row.
    """
    # Only fixed SQL fragments are joined; all values go through placeholders.
    conditions = ["user_id = ?"]
    bind_values: list[Any] = [account["id"]]

    if parent_job_id is not None:
        wanted_parent = parent_job_id.strip()
        if not wanted_parent:
            conditions.append("(parent_job_id IS NULL OR parent_job_id = '')")
        else:
            conditions.append("parent_job_id = ?")
            bind_values.append(wanted_parent)

    if line_type:
        conditions.append("line_type = ?")
        bind_values.append(line_type.strip())

    where_sql = " AND ".join(conditions)
    query = f"SELECT * FROM jobs WHERE {where_sql} ORDER BY created_at DESC"
    matching_rows = db.fetch_all(query, tuple(bind_values))
    return list(map(job_payload, matching_rows))
@app.post("/v2/explore/text")
async def create_text_job(request: ExploreTextRequest, account: dict[str, Any] = Depends(require_approved)) -> dict[str, Any]:
    """Create an analysis job from inline text and hand it to the orchestrator.

    An ``inline_text`` content source is recorded first, holding the first
    280 characters of the text as a preview. The full text travels in the
    job's ``input_text`` artifact.

    Returns:
        The ``job_payload`` of the job returned by ``trigger_orchestrated_job``.
    """
    owner_id = account["id"]
    owner_name = account["username"]

    target_project = resolve_target_project(owner_id, request.project_id or None, username=owner_name)
    target_project_id = target_project["id"]
    target_kb = resolve_target_kb(owner_id, request.knowledge_base_id, target_project_id, username=owner_name)
    target_assistant = resolve_target_assistant(owner_id, request.assistant_id, target_project_id)
    model_profile = model_profile_for_account(owner_id, request.analysis_model_profile_id)

    clean_title = request.title.strip()
    content_source = create_content_source(
        account_id=owner_id,
        project_id=target_project_id,
        source_kind="inline_text",
        title=clean_title,
        metadata={"content_preview": request.content[:280]},
    )

    assistant_ref = target_assistant.get("id") if target_assistant else None
    pending_job = create_job_record(
        account_id=owner_id,
        project_id=target_project_id,
        knowledge_base_id=target_kb["id"],
        source_type="text",
        line_type="analysis",
        workflow_key="analysis_pipeline",
        title=clean_title,
        language="zh-CN",
        assistant_id=assistant_ref,
        content_source_id=content_source["id"],
        artifacts={"input_text": request.content},
        analysis_model_profile_id=model_profile["id"],
    )
    triggered_job = await trigger_orchestrated_job(pending_job)
    return job_payload(triggered_job)
project["id"], username=account["username"]) + assistant = resolve_target_assistant(account["id"], request.assistant_id, project["id"]) + profile = model_profile_for_account(account["id"], request.analysis_model_profile_id) + source = create_content_source( + account_id=account["id"], + project_id=project["id"], + source_kind="video_link", + source_url=request.video_url.strip(), + title=(request.title or "短视频素材").strip(), + metadata={"platform": "video_link"}, + ) + job_row = create_job_record( + account_id=account["id"], + project_id=project["id"], + knowledge_base_id=kb["id"], + source_type="video_link", + line_type="analysis", + workflow_key="analysis_pipeline", + title=(request.title or "短视频素材").strip(), + language=request.language, + source_url=request.video_url.strip(), + assistant_id=(assistant or {}).get("id"), + content_source_id=source["id"], + artifacts={}, + analysis_model_profile_id=profile["id"], + ) + return job_payload(await trigger_orchestrated_job(job_row)) + + +@app.post("/v2/explore/upload-video") +async def upload_video( + file: UploadFile = File(...), + title: str = Form(""), + project_id: str = Form(""), + knowledge_base_id: str = Form(""), + assistant_id: str = Form(""), + analysis_model_profile_id: str = Form(""), + account: dict[str, Any] = Depends(require_approved), +) -> dict[str, Any]: + project = resolve_target_project(account["id"], project_id or None, username=account["username"]) + kb = resolve_target_kb(account["id"], knowledge_base_id or None, project["id"], username=account["username"]) + assistant = resolve_target_assistant(account["id"], assistant_id or None, project["id"]) + profile = model_profile_for_account(account["id"], analysis_model_profile_id or None) + job_id = make_id("job_upload") + job_dir = JOBS_DIR / job_id + job_dir.mkdir(parents=True, exist_ok=True) + suffix = Path(file.filename or "upload.mp4").suffix or ".mp4" + target_path = job_dir / f"source{suffix}" + with target_path.open("wb") as handle: + 
@app.post("/v2/pipelines/real-cut")
async def create_real_cut_job(
    request: RealCutJobRequest,
    account: dict[str, Any] = Depends(require_approved),
) -> dict[str, Any]:
    """Create and trigger a ``real_cut_pipeline`` job.

    The input directory comes either from ``request.input_dir`` or, when that
    is blank, from staging the referenced source job's media via
    ``stage_real_cut_source_to_cutvideo``. A ``real_cut_input`` content source
    and a job row carrying the CutVideo request parameters are created, then
    the job is passed to ``trigger_orchestrated_job``.

    Raises:
        HTTPException: 400 when the source job belongs to a different project
            than the resolved target, or when neither ``input_dir`` nor
            ``source_job_id`` is provided.
    """
    origin_job_id = request.source_job_id.strip()
    origin_job = load_owned_job(origin_job_id, account["id"]) if origin_job_id else None

    # An explicit project on the request wins; otherwise inherit the source job's.
    inherited_project_id = origin_job.get("project_id", "") if origin_job else ""
    wanted_project_id = request.project_id or inherited_project_id
    project = resolve_target_project(account["id"], wanted_project_id or None, username=account["username"])

    if origin_job:
        origin_project_id = origin_job.get("project_id")
        if origin_project_id and origin_project_id != project["id"]:
            raise HTTPException(status_code=400, detail="Source job does not belong to target project")

    kb = ensure_user_kb(account["id"], project["id"], username=account["username"])

    input_dir = request.input_dir.strip()
    staging: dict[str, Any] = {}
    if not input_dir:
        if not origin_job:
            raise HTTPException(status_code=400, detail="input_dir or source_job_id is required")
        staging = await stage_real_cut_source_to_cutvideo(origin_job)
        input_dir = staging["input_dir"]

    metadata: dict[str, Any] = {"line_type": "real_cut"}
    if origin_job:
        content_url = origin_job.get("source_url") or input_dir
        metadata["source_job_id"] = origin_job["id"]
        metadata["source_job_type"] = origin_job.get("source_type", "")
    else:
        content_url = input_dir
    if staging:
        metadata["cutvideo_upload"] = staging.get("upload", {})
        metadata["source_media_path"] = staging.get("source_path", "")

    job_title = request.title.strip()
    source = create_content_source(
        account_id=account["id"],
        project_id=project["id"],
        source_kind="real_cut_input",
        title=job_title,
        source_url=content_url,
        local_path=input_dir,
        metadata=metadata,
    )

    cutvideo_request = {
        "base_config": request.base_config.strip() or CUTVIDEO_BASE_CONFIG,
        "name": job_title,
        "input_dir": input_dir,
        "objective": request.objective,
        "target_duration_sec": request.target_duration_sec,
        "target_aspect_ratio": request.target_aspect_ratio,
        "ideal_segment_duration_sec": request.ideal_segment_duration_sec,
        "max_segment_duration_sec": request.max_segment_duration_sec,
        "transcript_backend": request.transcript_backend,
        "transcript_device": request.transcript_device,
        "review_enabled": request.review_enabled,
        "dry_run": request.dry_run,
    }
    job_artifacts = {
        "source_job_id": origin_job["id"] if origin_job else "",
        "source_media_path": staging.get("source_path", ""),
        "cutvideo_upload": staging.get("upload", {}),
        "cutvideo_request": cutvideo_request,
    }
    job_row = create_job_record(
        account_id=account["id"],
        project_id=project["id"],
        knowledge_base_id=kb["id"],
        source_type="real_cut",
        line_type="real_cut",
        workflow_key="real_cut_pipeline",
        title=job_title,
        source_url=input_dir,
        content_source_id=source["id"],
        artifacts=job_artifacts,
    )
    triggered = await trigger_orchestrated_job(job_row)
    return job_payload(triggered)
@app.post("/v2/pipelines/ai-video")
async def create_ai_video_job(
    request: AiVideoJobRequest,
    account: dict[str, Any] = Depends(require_approved),
) -> dict[str, Any]:
    """Create and trigger an ``ai_video_pipeline`` job.

    When ``request.source_job_id`` is non-blank, the referenced job must be
    owned by the caller and have status ``completed``; its project and
    knowledge base become the defaults when the request does not name its own.

    Raises:
        HTTPException: 404 when the source job is not found for this account,
            409 when it has not completed yet.
    """
    origin_job_id = request.source_job_id.strip()
    inherited_project_id = ""
    inherited_kb_id = ""

    if origin_job_id:
        origin_job = db.fetch_one(
            "SELECT * FROM jobs WHERE id = ? AND user_id = ?",
            (origin_job_id, account["id"]),
        )
        if not origin_job:
            raise HTTPException(status_code=404, detail="Source job not found")
        if origin_job["status"] != "completed":
            raise HTTPException(status_code=409, detail="Source job must be completed before AI video generation")
        inherited_project_id = origin_job.get("project_id", "")
        inherited_kb_id = origin_job.get("knowledge_base_id", "")

    # Values on the request take precedence over what the source job provides.
    wanted_project_id = request.project_id or inherited_project_id
    project = resolve_target_project(account["id"], wanted_project_id or None, username=account["username"])
    wanted_kb_id = request.knowledge_base_id or inherited_kb_id or None
    kb = resolve_target_kb(account["id"], wanted_kb_id, project["id"], username=account["username"])
    assistant = resolve_target_assistant(account["id"], request.assistant_id or None, project["id"])

    job_title = request.title.strip()
    source = create_content_source(
        account_id=account["id"],
        project_id=project["id"],
        source_kind="ai_video_brief",
        title=job_title,
        metadata={"source_job_id": origin_job_id},
    )

    render_settings = {
        "brief": request.brief,
        "style": request.style,
        "shots": request.shots,
        "image_provider": request.image_provider,
        "image_model": request.image_model,
        "video_provider": request.video_provider,
        "video_model": request.video_model,
        "aspect_ratio": request.aspect_ratio,
        "duration": request.duration,
        "source_job_id": origin_job_id,
    }
    job_row = create_job_record(
        account_id=account["id"],
        project_id=project["id"],
        knowledge_base_id=kb["id"],
        source_type="ai_video",
        line_type="ai_video",
        workflow_key="ai_video_pipeline",
        title=job_title,
        assistant_id=assistant.get("id") if assistant else None,
        content_source_id=source["id"],
        artifacts=render_settings,
    )
    triggered = await trigger_orchestrated_job(job_row)
    return job_payload(triggered)
AND user_id = ?", (assistant_id, account["id"])) + if not current: + raise HTTPException(status_code=404, detail="Assistant not found") + project_id = current.get("project_id", "") + if request.project_id is not None: + project_id = resolve_target_project(account["id"], request.project_id, username=account["username"])["id"] + payload = { + "name": request.name if request.name is not None else current["name"], + "description": request.description if request.description is not None else current.get("description", ""), + "system_prompt": request.system_prompt if request.system_prompt is not None else current.get("system_prompt", ""), + "generation_goal": request.generation_goal if request.generation_goal is not None else current.get("generation_goal", ""), + "project_id": project_id, + "model_profile_id": current.get("model_profile_id", ""), + } + if request.model_profile_id is not None: + payload["model_profile_id"] = model_profile_for_account(account["id"], request.model_profile_id)["id"] + db.execute( + """ + UPDATE assistants + SET project_id = ?, name = ?, description = ?, system_prompt = ?, generation_goal = ?, model_profile_id = ?, updated_at = ? + WHERE id = ? + """, + ( + payload["project_id"], + payload["name"], + payload["description"], + payload["system_prompt"], + payload["generation_goal"], + payload["model_profile_id"], + utc_now(), + assistant_id, + ), + ) + if request.knowledge_base_ids is not None: + db.execute("DELETE FROM assistant_knowledge_bases WHERE assistant_id = ?", (assistant_id,)) + for kb_id in request.knowledge_base_ids: + kb = db.fetch_one("SELECT id FROM knowledge_bases WHERE id = ? 
@app.post("/v2/assistants/{assistant_id}/generate")
async def generate_copy(assistant_id: str, request: GenerateCopyRequest, account: dict[str, Any] = Depends(require_approved)) -> dict[str, Any]:
    """Generate copy with an assistant, using knowledge-base documents as reference.

    The knowledge bases consulted are the ones named on the request, or the
    ones linked to the assistant when the request names none. Only knowledge
    bases owned by the calling account are used; any other id is dropped, the
    same way ``create_assistant`` / ``update_assistant`` ignore foreign ids.
    Up to three of the newest documents per knowledge base are excerpted into
    the prompt.

    Returns:
        A dict with the assistant id, the knowledge-base ids actually used,
        the generated content, the (truncated) prompt excerpt and the
        documents that were consulted.

    Raises:
        HTTPException: 404 when the assistant does not exist for this account.
    """
    assistant = db.fetch_one("SELECT * FROM assistants WHERE id = ? AND user_id = ?", (assistant_id, account["id"]))
    if not assistant:
        raise HTTPException(status_code=404, detail="Assistant not found")

    requested_kb_ids = request.knowledge_base_ids or [
        row["knowledge_base_id"]
        for row in db.fetch_all(
            "SELECT knowledge_base_id FROM assistant_knowledge_bases WHERE assistant_id = ?",
            (assistant_id,),
        )
    ]
    # Documents are looked up by knowledge_base_id alone, so the ownership check
    # has to happen here; otherwise a caller could pass another account's
    # knowledge-base id and get its documents back in ``used_documents``.
    kb_ids = [
        kb_id
        for kb_id in requested_kb_ids
        if db.fetch_one("SELECT id FROM knowledge_bases WHERE id = ? AND user_id = ?", (kb_id, account["id"]))
    ]

    used_documents: list[dict[str, Any]] = []
    excerpts: list[str] = []
    for kb_id in kb_ids:
        docs = db.fetch_all("SELECT * FROM knowledge_documents WHERE knowledge_base_id = ? ORDER BY created_at DESC LIMIT 3", (kb_id,))
        for doc in docs:
            payload = document_payload(doc)
            used_documents.append(payload)
            # Trailing "" keeps the slice below safe if every text field is empty/None.
            excerpt = payload["combined_text"] or payload["style_summary"] or payload["transcript_text"] or ""
            excerpts.append(f"[{payload['title']}]\n{excerpt[:1200]}")

    # Cap the total reference material placed in the prompt.
    prompt_excerpt = "\n\n".join(excerpts)[:6000]
    system_prompt = assistant.get("system_prompt") or "你是文案助手。"
    generation_goal = assistant.get("generation_goal") or "生成短视频文案。"
    user_prompt = (
        f"任务目标:{generation_goal}\n"
        f"创作需求:{request.brief}\n"
        f"平台:{request.platform}\n"
        f"受众:{request.audience}\n"
        f"额外要求:{request.extra_requirements or '无'}\n\n"
        f"参考知识库素材:\n{prompt_excerpt or '暂无参考素材,请按通用短视频结构输出。'}\n\n"
        "请输出完整文案,包含标题、开场钩子、正文结构和结尾行动指令。"
    )
    profile = model_profile_for_account(account["id"], assistant.get("model_profile_id") or None)
    content = await call_model(profile, system_prompt, user_prompt, temperature=0.7)
    return {
        "assistant_id": assistant_id,
        "knowledge_base_ids": kb_ids,
        "content": content,
        "prompt_excerpt": prompt_excerpt[:2000],
        "used_documents": used_documents,
    }
def parse_job_artifacts(row: dict[str, Any]) -> dict[str, Any]:
    """Decode a job row's ``artifacts_json`` column into a dict.

    Returns an empty dict when the column is missing or empty, when it is not
    valid JSON, or when it decodes to anything other than a JSON object
    (e.g. ``[]``, ``null`` or a number). Callers call ``.get(...)`` on the
    result, so a non-dict value must never be returned; this matches the
    behavior of ``parse_job_result``.
    """
    raw = row.get("artifacts_json") or "{}"
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        return {}
    return data if isinstance(data, dict) else {}
@app.post("/internal/jobs/steps/analyze")
async def internal_run_analysis(
    request: InternalStepRequest | None = Body(default=None),
    job_id: str = Query(default=""),
    _: bool = Depends(require_orchestrator),
) -> dict[str, Any]:
    """Run the analysis step for a job and return its refreshed context.

    The job is resolved by ``load_step_job`` for the ``analysis_pipeline``
    workflow, processed with ``process_job``, then re-read so the returned
    context reflects the row as stored after processing.
    """
    target_id = load_step_job(request, job_id, "analysis_pipeline")["id"]
    await process_job(target_id)
    refreshed = load_internal_job(target_id)
    return job_context_payload(refreshed)
enumerate(discovered_items, start=1): + video_url = str(item.get("video_url") or "").strip() + if not video_url: + continue + existing_row = db.fetch_one( + """ + SELECT * FROM jobs + WHERE user_id = ? AND project_id = ? AND source_type = 'video_link' AND source_url = ? + ORDER BY created_at DESC + LIMIT 1 + """, + (row["user_id"], row.get("project_id", ""), video_url), + ) + if existing_row and skip_existing: + skipped_items.append( + { + "video_url": video_url, + "title": item.get("title") or existing_row.get("title") or "短视频素材", + "existing_job_id": existing_row["id"], + "existing_status": existing_row.get("status", ""), + } + ) + continue + + content_source = create_content_source( + account_id=row["user_id"], + project_id=row.get("project_id", ""), + source_kind="video_link", + platform=str(artifacts.get("platform") or infer_platform_from_url(video_url)), + handle=str(artifacts.get("handle") or ""), + source_url=video_url, + title=str(item.get("title") or f"内容源视频 {index}"), + metadata={ + "origin_content_source_id": row.get("content_source_id", ""), + "origin_sync_job_id": row["id"], + "external_id": str(item.get("external_id") or ""), + "source_account_url": source_url, + }, + ) + child_row = create_job_record( + account_id=row["user_id"], + project_id=row.get("project_id", ""), + parent_job_id=row["id"], + knowledge_base_id=row["knowledge_base_id"], + source_type="video_link", + line_type="analysis", + workflow_key="analysis_pipeline", + title=str(item.get("title") or f"内容源视频 {index}"), + language=row.get("language", "auto"), + source_url=video_url, + assistant_id=row.get("assistant_id"), + content_source_id=content_source["id"], + artifacts={ + "origin_content_source_id": row.get("content_source_id", ""), + "origin_sync_job_id": row["id"], + "source_account_url": source_url, + }, + analysis_model_profile_id=row.get("analysis_model_profile_id", ""), + ) + child_jobs.append(job_payload(child_row)) + if auto_trigger_analysis: + queued_child = await 
@app.post("/internal/jobs/steps/real-cut/submit")
async def internal_real_cut_submit(
    request: InternalStepRequest | None = Body(default=None),
    job_id: str = Query(default=""),
    _: bool = Depends(require_orchestrator),
) -> dict[str, Any]:
    """Submit a real-cut job's stored CutVideo request and mark it processing.

    Reads ``cutvideo_request`` from the job's artifacts, records a
    ``cutvideo.submit.requested`` job event, submits the request through
    ``cutvideo_client`` and stores the returned ``task_id`` as the job's
    provider task id.

    Raises:
        HTTPException: 503 when the CutVideo client is not enabled, 400 when
            the stored request payload is not a dict.
    """
    if not cutvideo_client.enabled:
        raise HTTPException(status_code=503, detail="CutVideo is not configured")

    job = load_step_job(request, job_id, "real_cut_pipeline")
    submit_payload = parse_job_artifacts(job).get("cutvideo_request") or {}
    if not isinstance(submit_payload, dict):
        raise HTTPException(status_code=400, detail="Invalid cutvideo request payload")

    append_job_event(job["id"], "cutvideo.submit.requested", submit_payload)
    submission = await cutvideo_client.submit_job(submit_payload)

    new_state = update_job_state(
        job["id"],
        status="processing",
        provider_name="cutvideo",
        provider_task_id=str(submission.get("task_id") or ""),
        result={"cutvideo_submit": submission},
    )
    return job_context_payload(new_state)
@app.post("/internal/jobs/steps/real-cut/run")
async def internal_real_cut_run(
    request: InternalStepRequest | None = Body(default=None),
    job_id: str = Query(default=""),
    _: bool = Depends(require_orchestrator),
) -> dict[str, Any]:
    """Submit a real-cut job to CutVideo if needed, then poll until it finishes.

    If the job has no provider task id yet, its stored ``cutvideo_request`` is
    submitted first. The task is then polled every
    ``CUTVIDEO_POLL_INTERVAL_SEC`` seconds until it reports ``completed`` or
    ``failed``, or until ``CUTVIDEO_MAX_WAIT_SEC`` seconds have elapsed, at
    which point the job is marked failed with a timeout error.

    Returns:
        The job context payload for the job in its final recorded state.

    Raises:
        HTTPException: 503 when the CutVideo client is not enabled, 400 when
            the stored request payload is not a dict.
    """
    if not cutvideo_client.enabled:
        raise HTTPException(status_code=503, detail="CutVideo is not configured")

    row = load_step_job(request, job_id, "real_cut_pipeline")
    if not row.get("provider_task_id"):
        artifacts = parse_job_artifacts(row)
        cutvideo_request = artifacts.get("cutvideo_request") or {}
        if not isinstance(cutvideo_request, dict):
            raise HTTPException(status_code=400, detail="Invalid cutvideo request payload")
        submit_result = await cutvideo_client.submit_job(cutvideo_request)
        row = update_job_state(
            row["id"],
            status="processing",
            provider_name="cutvideo",
            provider_task_id=str(submit_result.get("task_id") or ""),
            result={"cutvideo_submit": submit_result},
        )

    # The wait budget must be CutVideo's own; this previously used
    # HUOBAO_MAX_WAIT_SEC, which belongs to a different provider.
    deadline = now_ts() + CUTVIDEO_MAX_WAIT_SEC
    while True:
        task_payload = await cutvideo_client.get_task(row["provider_task_id"])
        status = str(task_payload.get("status") or "").lower()
        if status == "completed":
            run_payload: dict[str, Any] = {}
            run_id = str(task_payload.get("run_id") or "")
            if run_id:
                run_payload = await cutvideo_client.get_run(run_id)
            updated = update_job_state(
                row["id"],
                status="completed",
                provider_name="cutvideo",
                provider_task_id=row["provider_task_id"],
                artifacts={"cutvideo_task": task_payload, "cutvideo_run": run_payload},
                result={"cutvideo_task": task_payload, "cutvideo_run": run_payload},
            )
            return job_context_payload(updated)
        if status == "failed":
            updated = update_job_state(
                row["id"],
                status="failed",
                error=str(task_payload.get("error") or "CutVideo task failed"),
                provider_name="cutvideo",
                provider_task_id=row["provider_task_id"],
                artifacts={"cutvideo_task": task_payload},
                result={"cutvideo_task": task_payload},
            )
            return job_context_payload(updated)
        if now_ts() >= deadline:
            updated = update_job_state(
                row["id"],
                status="failed",
                error="CutVideo task timed out",
                provider_name="cutvideo",
                provider_task_id=row["provider_task_id"],
                artifacts={"cutvideo_task": task_payload},
                result={"cutvideo_task": task_payload},
            )
            return job_context_payload(updated)
        await asyncio.sleep(CUTVIDEO_POLL_INTERVAL_SEC)
        # Re-read the row so the next poll uses the currently stored task id.
        row = load_internal_job(row["id"])
AND user_id = ?", (source_job_id, row["user_id"])) + if source_job: + source_storyboards = extract_source_storyboards(source_job) + + if source_storyboards: + storyboard_items = source_storyboards[: max(int(artifacts.get("shots") or 4), 1)] + else: + profile = model_profile_for_account(row["user_id"], row.get("analysis_model_profile_id") or None) + blueprint = await generate_content_blueprint( + profile, + title=row["title"], + transcript_text=str(artifacts.get("brief") or row["title"]), + style_summary=str(artifacts.get("style") or ""), + agent_prompt=(assistant or {}).get("system_prompt", ""), + generation_goal=(assistant or {}).get("generation_goal", "") or "生成适合视频模型的分镜与提示词", + ) + storyboard_items = coerce_storyboards(blueprint.get("storyboards"))[: max(int(artifacts.get("shots") or 4), 1)] + + if not storyboard_items: + raise HTTPException(status_code=400, detail="No storyboards available for AI video rendering") + + drama_payload = await huobao_client.create_drama( + { + "title": row["title"], + "description": str(artifacts.get("brief") or row["title"]), + "style": str(artifacts.get("style") or "realistic"), + "genre": "short_video", + "tags": "storyforge", + } + ) + drama_id = str(drama_payload.get("id") or "") + if not drama_id: + raise RuntimeError("Huobao did not return drama id") + + update_job_state( + row["id"], + status="processing", + provider_name="huobao-drama", + provider_task_id=drama_id, + result={"huobao_drama": drama_payload}, + ) + + rendered_scenes: list[dict[str, Any]] = [] + image_provider = str(artifacts.get("image_provider") or "openai") + image_model = str(artifacts.get("image_model") or "") + video_provider = str(artifacts.get("video_provider") or "doubao") + video_model = str(artifacts.get("video_model") or "") + aspect_ratio = str(artifacts.get("aspect_ratio") or "9:16") + image_size = huobao_image_size_for_aspect_ratio(aspect_ratio) + duration = int(artifacts.get("duration") or 5) + style = str(artifacts.get("style") or "realistic") 
+ + for idx, storyboard in enumerate(storyboard_items, start=1): + first_prompt = str(storyboard.get("first_frame_prompt") or storyboard.get("visual") or storyboard.get("title") or row["title"]) + last_prompt = str(storyboard.get("last_frame_prompt") or storyboard.get("visual") or storyboard.get("title") or row["title"]) + video_prompt = str(storyboard.get("video_prompt") or storyboard.get("narration") or storyboard.get("title") or row["title"]) + + first_image = await huobao_client.generate_image( + { + "drama_id": drama_id, + "image_type": "storyboard", + "frame_type": "first", + "prompt": first_prompt, + "provider": image_provider, + "model": image_model, + "size": image_size, + "style": style, + } + ) + last_image = await huobao_client.generate_image( + { + "drama_id": drama_id, + "image_type": "storyboard", + "frame_type": "last", + "prompt": last_prompt, + "provider": image_provider, + "model": image_model, + "size": image_size, + "style": style, + } + ) + + first_ready = await wait_for_huobao_image(str(first_image.get("id") or "")) + last_ready = await wait_for_huobao_image(str(last_image.get("id") or "")) + if str(first_ready.get("status") or "").lower() != "completed": + raise RuntimeError(f"First frame generation failed for scene {idx}") + if str(last_ready.get("status") or "").lower() != "completed": + raise RuntimeError(f"Last frame generation failed for scene {idx}") + + first_frame_url = first_ready.get("image_url") or first_ready.get("local_path") + last_frame_url = last_ready.get("image_url") or last_ready.get("local_path") + if not first_frame_url or not last_frame_url: + raise RuntimeError(f"Huobao image output missing for scene {idx}") + + video_payload = await huobao_client.generate_video( + { + "drama_id": drama_id, + "prompt": video_prompt, + "provider": video_provider, + "model": video_model, + "reference_mode": "first_last", + "first_frame_url": first_frame_url, + "last_frame_url": last_frame_url, + "aspect_ratio": aspect_ratio, + 
"duration": duration, + "style": style, + } + ) + video_ready = await wait_for_huobao_video(str(video_payload.get("id") or "")) + if str(video_ready.get("status") or "").lower() != "completed": + raise RuntimeError(f"Video generation failed for scene {idx}") + + rendered_scenes.append( + { + "shot_index": storyboard.get("shot_index", idx), + "title": storyboard.get("title", f"镜头{idx}"), + "narration": storyboard.get("narration", ""), + "first_frame": first_ready, + "last_frame": last_ready, + "video": video_ready, + } + ) + + updated = update_job_state( + row["id"], + status="completed", + provider_name="huobao-drama", + provider_task_id=drama_id, + artifacts={ + "huobao_drama_id": drama_id, + "source_job_id": source_job_id, + }, + result={ + "huobao_drama": drama_payload, + "rendered_scenes": rendered_scenes, + "storyboards": storyboard_items, + }, + ) + return job_context_payload(updated) + + +@app.post("/internal/jobs/{job_id}/status") +def internal_update_job_status(job_id: str, request: JobStatusUpdateRequest, _: bool = Depends(require_orchestrator)) -> dict[str, Any]: + updated = update_job_state( + job_id, + status=request.status, + error=request.error, + provider_name=request.provider_name or None, + provider_task_id=request.provider_task_id or None, + artifacts=request.artifacts, + result=request.result, + ) + return job_context_payload(updated) + + +@app.get("/v2/admin/accounts/pending") +def pending_accounts(admin: dict[str, Any] = Depends(require_super_admin)) -> list[dict[str, Any]]: + rows = db.fetch_all("SELECT * FROM accounts WHERE approval_status = 'pending' ORDER BY created_at ASC") + return [normalize_account(row) for row in rows] + + +@app.post("/v2/admin/accounts/{account_id}/approve") +def approve_account(account_id: str, admin: dict[str, Any] = Depends(require_super_admin)) -> dict[str, Any]: + account = db.fetch_one("SELECT * FROM accounts WHERE id = ?", (account_id,)) + if not account: + raise HTTPException(status_code=404, detail="Account 
@app.post("/v2/admin/accounts/{account_id}/reject")
def reject_account(account_id: str, admin: dict[str, Any] = Depends(require_super_admin)) -> dict[str, Any]:
    """Mark an account as rejected and return the updated record.

    Raises:
        HTTPException: 404 when no account with ``account_id`` exists.
    """
    select_sql = "SELECT * FROM accounts WHERE id = ?"
    lookup = (account_id,)
    if not db.fetch_one(select_sql, lookup):
        raise HTTPException(status_code=404, detail="Account not found")

    # The acting admin and the review time are written to the approved_* columns.
    review_params = (admin["id"], utc_now(), utc_now(), account_id)
    db.execute(
        "UPDATE accounts SET approval_status = 'rejected', approved_by = ?, approved_at = ?, updated_at = ? WHERE id = ?",
        review_params,
    )
    return {"saved": True, "account": normalize_account(db.fetch_one(select_sql, lookup))}


@app.get("/api/v1/app/update/latest")
def latest_update(
    platform: str = Query(default="android"),
    channel: str = Query(default="stable"),
    currentVersionCode: int | None = Query(default=None),
) -> dict[str, Any]:
    """Describe the newest active release for ``platform`` / ``channel``.

    ``hasUpdate`` is true when the caller sent no ``currentVersionCode`` or the
    newest active ``version_code`` is greater than it. When no active release
    exists, a placeholder payload with ``hasUpdate`` false is returned.
    """
    newest = db.fetch_one(
        "SELECT * FROM app_updates WHERE platform = ? AND channel = ? AND is_active = 1 ORDER BY version_code DESC, published_at DESC LIMIT 1",
        (platform, channel),
    )
    if newest:
        newest_code = int(newest["version_code"])
        client_is_behind = currentVersionCode is None or newest_code > currentVersionCode
        return {
            "platform": newest["platform"],
            "channel": newest["channel"],
            "hasUpdate": client_is_behind,
            "latestVersionCode": newest_code,
            "latestVersionName": newest["version_name"],
            "minSupportedCode": int(newest["min_supported_code"]),
            "downloadUrl": newest["apk_url"],
            "apkSha256": newest.get("apk_sha256", ""),
            "releaseNotes": newest.get("notes", ""),
            "forceUpdate": bool(newest.get("force_update", 0)),
            "publishedAt": int(newest.get("published_at", 0)),
        }

    # Nothing published: echo the request back with empty release details.
    return {
        "platform": platform,
        "channel": channel,
        "hasUpdate": False,
        "latestVersionCode": currentVersionCode or 0,
        "latestVersionName": "",
        "minSupportedCode": 0,
        "downloadUrl": "",
        "apkSha256": "",
        "releaseNotes": "",
        "forceUpdate": False,
        "publishedAt": 0,
    }
def utc_now() -> str:
    """Return the current UTC time as an ISO-8601 string truncated to whole seconds."""
    from datetime import datetime, timezone

    return datetime.now(timezone.utc).replace(microsecond=0).isoformat()


def dict_factory(cursor: sqlite3.Cursor, row: tuple[Any, ...]) -> dict[str, Any]:
    """Row factory that maps each result column name to its value."""
    return {column[0]: value for column, value in zip(cursor.description, row)}


def _quote_identifier(name: str) -> str:
    """Quote *name* as an SQLite identifier, doubling embedded double quotes."""
    return '"' + name.replace('"', '""') + '"'


class Database:
    """Small SQLite helper that opens a fresh connection for every operation."""

    def __init__(self, path: str) -> None:
        """Remember the database path and create its parent directory."""
        self.path = Path(path)
        self.path.parent.mkdir(parents=True, exist_ok=True)

    def connect(self) -> sqlite3.Connection:
        """Open a connection that returns dict rows and enforces foreign keys."""
        conn = sqlite3.connect(self.path)
        conn.row_factory = dict_factory
        conn.execute("PRAGMA foreign_keys = ON")
        return conn

    @contextmanager
    def session(self) -> Iterator[sqlite3.Connection]:
        """Yield a connection; commit on success, roll back on error, always close."""
        conn = self.connect()
        try:
            yield conn
            conn.commit()
        except BaseException:
            # Make the rollback explicit instead of relying on close() to discard it.
            conn.rollback()
            raise
        finally:
            conn.close()

    def fetch_one(self, sql: str, params: tuple[Any, ...] = ()) -> dict[str, Any] | None:
        """Run *sql* and return the first row as a dict, or ``None`` if there is none."""
        with self.session() as conn:
            return conn.execute(sql, params).fetchone()

    def fetch_all(self, sql: str, params: tuple[Any, ...] = ()) -> list[dict[str, Any]]:
        """Run *sql* and return every row as a list of dicts."""
        with self.session() as conn:
            return list(conn.execute(sql, params).fetchall())

    def execute(self, sql: str, params: tuple[Any, ...] = ()) -> None:
        """Run a single statement in its own committed session."""
        with self.session() as conn:
            conn.execute(sql, params)

    def table_exists(self, name: str) -> bool:
        """Return whether a table called *name* is listed in ``sqlite_master``."""
        row = self.fetch_one(
            "SELECT name FROM sqlite_master WHERE type = 'table' AND name = ?",
            (name,),
        )
        return bool(row)

    def column_exists(self, table: str, column: str) -> bool:
        """Return whether *table* has a column called *column*.

        PRAGMA arguments cannot be bound as parameters, so the table name is
        quoted as an identifier; an unknown or odd name simply yields ``False``.
        """
        with self.session() as conn:
            rows = conn.execute(f"PRAGMA table_info({_quote_identifier(table)})").fetchall()
        return any(row["name"] == column for row in rows)

    def init_schema(self) -> None:
        """Create all tables that do not exist yet, then run ``migrate_schema``."""
        schema = """
        CREATE TABLE IF NOT EXISTS accounts (
            id TEXT PRIMARY KEY,
            username TEXT NOT NULL UNIQUE,
            password_hash TEXT NOT NULL,
            password_salt TEXT NOT NULL,
            display_name TEXT NOT NULL,
            role TEXT NOT NULL,
            approval_status TEXT NOT NULL,
            approved_by TEXT,
            approved_at TEXT,
            preferred_analysis_model_id TEXT,
            created_at TEXT NOT NULL,
            updated_at TEXT NOT NULL
        );

        CREATE TABLE IF NOT EXISTS auth_tokens (
            token TEXT PRIMARY KEY,
            account_id TEXT NOT NULL,
            created_at TEXT NOT NULL,
            FOREIGN KEY(account_id) REFERENCES accounts(id) ON DELETE CASCADE
        );

        CREATE TABLE IF NOT EXISTS model_profiles (
            id TEXT PRIMARY KEY,
            owner_account_id TEXT,
            name TEXT NOT NULL,
            provider TEXT NOT NULL,
            base_url TEXT NOT NULL,
            api_key TEXT NOT NULL DEFAULT '',
            model_name TEXT NOT NULL,
            is_system INTEGER NOT NULL DEFAULT 0,
            is_default INTEGER NOT NULL DEFAULT 0,
            created_at TEXT NOT NULL,
            updated_at TEXT NOT NULL,
            FOREIGN KEY(owner_account_id) REFERENCES accounts(id) ON DELETE CASCADE
        );

        CREATE TABLE IF NOT EXISTS knowledge_bases (
            id TEXT PRIMARY KEY,
            user_id TEXT NOT NULL,
            project_id TEXT,
            name TEXT NOT NULL,
            description TEXT NOT NULL DEFAULT '',
            sync_status TEXT NOT NULL DEFAULT 'ready',
            created_at TEXT NOT NULL,
            updated_at TEXT NOT NULL,
            FOREIGN KEY(user_id) REFERENCES accounts(id) ON DELETE CASCADE
        );

        CREATE TABLE IF NOT EXISTS knowledge_documents (
            id TEXT PRIMARY KEY,
            knowledge_base_id TEXT NOT NULL,
            title TEXT NOT NULL,
            source_type TEXT NOT NULL,
            source_url TEXT NOT NULL DEFAULT '',
            transcript_text TEXT NOT NULL DEFAULT '',
            style_summary TEXT NOT NULL DEFAULT '',
            combined_text TEXT NOT NULL DEFAULT '',
            analysis_json TEXT NOT NULL DEFAULT '{}',
            storyboard_json TEXT NOT NULL DEFAULT '[]',
            source_artifact_json TEXT NOT NULL DEFAULT '{}',
            analysis_model_profile_id TEXT NOT NULL DEFAULT '',
            created_at TEXT NOT NULL,
            updated_at TEXT NOT NULL,
            FOREIGN KEY(knowledge_base_id) REFERENCES knowledge_bases(id) ON DELETE CASCADE
        );

        CREATE TABLE IF NOT EXISTS assistants (
            id TEXT PRIMARY KEY,
            user_id TEXT NOT NULL,
            project_id TEXT,
            name TEXT NOT NULL,
            description TEXT NOT NULL DEFAULT '',
            system_prompt TEXT NOT NULL DEFAULT '',
            generation_goal TEXT NOT NULL DEFAULT '',
            config_json TEXT NOT NULL DEFAULT '{}',
            model_profile_id TEXT NOT NULL DEFAULT '',
            created_at TEXT NOT NULL,
            updated_at TEXT NOT NULL,
            FOREIGN KEY(user_id) REFERENCES accounts(id) ON DELETE CASCADE
        );

        CREATE TABLE IF NOT EXISTS assistant_knowledge_bases (
            assistant_id TEXT NOT NULL,
            knowledge_base_id TEXT NOT NULL,
            PRIMARY KEY (assistant_id, knowledge_base_id),
            FOREIGN KEY(assistant_id) REFERENCES assistants(id) ON DELETE CASCADE,
            FOREIGN KEY(knowledge_base_id) REFERENCES knowledge_bases(id) ON DELETE CASCADE
        );

        CREATE TABLE IF NOT EXISTS jobs (
            id TEXT PRIMARY KEY,
            user_id TEXT NOT NULL,
            project_id TEXT,
            parent_job_id TEXT,
            assistant_id TEXT,
            knowledge_base_id TEXT NOT NULL,
            content_source_id TEXT,
            source_type TEXT NOT NULL,
            line_type TEXT NOT NULL DEFAULT 'analysis',
            workflow_key TEXT NOT NULL DEFAULT '',
            orchestrator TEXT NOT NULL DEFAULT 'n8n',
            provider_name TEXT NOT NULL DEFAULT '',
            provider_task_id TEXT NOT NULL DEFAULT '',
            source_url TEXT,
            title TEXT NOT NULL,
            language TEXT NOT NULL DEFAULT 'auto',
            status TEXT NOT NULL,
            transcript_text TEXT NOT NULL DEFAULT '',
            style_summary TEXT NOT NULL DEFAULT '',
            upload_status TEXT NOT NULL DEFAULT 'pending',
            error TEXT NOT NULL DEFAULT '',
            artifacts_json TEXT NOT NULL DEFAULT '{}',
            result_json TEXT NOT NULL DEFAULT '{}',
            analysis_model_profile_id TEXT NOT NULL DEFAULT '',
            created_at TEXT NOT NULL,
            updated_at TEXT NOT NULL,
            FOREIGN KEY(user_id) REFERENCES accounts(id) ON DELETE CASCADE,
            FOREIGN KEY(assistant_id) REFERENCES assistants(id) ON DELETE SET NULL,
            FOREIGN KEY(knowledge_base_id) REFERENCES knowledge_bases(id) ON DELETE CASCADE
        );

        CREATE TABLE IF NOT EXISTS projects (
            id TEXT PRIMARY KEY,
            user_id TEXT NOT NULL,
            name TEXT NOT NULL,
            description TEXT NOT NULL DEFAULT '',
            created_at TEXT NOT NULL,
            updated_at TEXT NOT NULL,
            FOREIGN KEY(user_id) REFERENCES accounts(id) ON DELETE CASCADE
        );

        CREATE TABLE IF NOT EXISTS content_sources (
            id TEXT PRIMARY KEY,
            user_id TEXT NOT NULL,
            project_id TEXT,
            source_kind TEXT NOT NULL,
            platform TEXT NOT NULL DEFAULT '',
            handle TEXT NOT NULL DEFAULT '',
            source_url TEXT NOT NULL DEFAULT '',
            title TEXT NOT NULL DEFAULT '',
            local_path TEXT NOT NULL DEFAULT '',
            metadata_json TEXT NOT NULL DEFAULT '{}',
            created_at TEXT NOT NULL,
            updated_at TEXT NOT NULL,
            FOREIGN KEY(user_id) REFERENCES accounts(id) ON DELETE CASCADE,
            FOREIGN KEY(project_id) REFERENCES projects(id) ON DELETE SET NULL
        );

        CREATE TABLE IF NOT EXISTS job_events (
            id TEXT PRIMARY KEY,
            job_id TEXT NOT NULL,
            event_type TEXT NOT NULL,
            payload_json TEXT NOT NULL DEFAULT '{}',
            created_at TEXT NOT NULL,
            FOREIGN KEY(job_id) REFERENCES jobs(id) ON DELETE CASCADE
        );

        CREATE TABLE IF NOT EXISTS app_updates (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            platform TEXT NOT NULL,
            channel TEXT NOT NULL,
            version_code INTEGER NOT NULL,
            version_name TEXT NOT NULL,
            min_supported_code INTEGER NOT NULL,
            apk_url TEXT NOT NULL,
            apk_sha256 TEXT NOT NULL DEFAULT '',
            notes TEXT NOT NULL DEFAULT '',
            force_update INTEGER NOT NULL DEFAULT 0,
            is_active INTEGER NOT NULL DEFAULT 1,
            published_at INTEGER NOT NULL,
            created_by TEXT NOT NULL
        );
        """
        with self.session() as conn:
            conn.executescript(schema)
        self.migrate_schema()

    def migrate_schema(self) -> None:
        """Add columns missing from already-existing tables, then backfill projects."""
        table_columns: dict[str, dict[str, str]] = {
            "knowledge_bases": {
                "project_id": "TEXT",
            },
            "knowledge_documents": {
                "analysis_json": "TEXT NOT NULL DEFAULT '{}'",
                "storyboard_json": "TEXT NOT NULL DEFAULT '[]'",
                "source_artifact_json": "TEXT NOT NULL DEFAULT '{}'",
            },
            "assistants": {
                "project_id": "TEXT",
                "config_json": "TEXT NOT NULL DEFAULT '{}'",
            },
            "jobs": {
                "project_id": "TEXT",
                "parent_job_id": "TEXT",
                "content_source_id": "TEXT",
                "line_type": "TEXT NOT NULL DEFAULT 'analysis'",
                "workflow_key": "TEXT NOT NULL DEFAULT ''",
                "orchestrator": "TEXT NOT NULL DEFAULT 'n8n'",
                "provider_name": "TEXT NOT NULL DEFAULT ''",
                "provider_task_id": "TEXT NOT NULL DEFAULT ''",
                "result_json": "TEXT NOT NULL DEFAULT '{}'",
            },
        }

        for table, columns in table_columns.items():
            if not self.table_exists(table):
                continue
            for column, definition in columns.items():
                if self.column_exists(table, column):
                    continue
                self.execute(
                    f"ALTER TABLE {_quote_identifier(table)} "
                    f"ADD COLUMN {_quote_identifier(column)} {definition}"
                )

        self.ensure_default_projects()

    def ensure_default_projects(self) -> None:
        """Give every account a project and attach its unassigned rows to it.

        For each account without a project, one named ``proj_<account id>`` is
        inserted. Rows in ``knowledge_bases``, ``assistants`` and ``jobs`` whose
        ``project_id`` is NULL or empty are then pointed at the account's
        earliest project. Does nothing when the ``projects`` table is absent.
        """
        if not self.table_exists("projects"):
            return

        # Column presence does not change inside this method, so probe once
        # instead of issuing three PRAGMA queries per account.
        backfill_tables = [
            table
            for table in ("knowledge_bases", "assistants", "jobs")
            if self.column_exists(table, "project_id")
        ]

        accounts = self.fetch_all("SELECT id, username FROM accounts ORDER BY created_at ASC")
        for account in accounts:
            project = self.fetch_one(
                "SELECT * FROM projects WHERE user_id = ? ORDER BY created_at ASC LIMIT 1",
                (account["id"],),
            )
            if not project:
                project_id = f"proj_{account['id']}"
                now = utc_now()
                self.execute(
                    """
                    INSERT INTO projects (id, user_id, name, description, created_at, updated_at)
                    VALUES (?, ?, ?, ?, ?, ?)
                    """,
                    (
                        project_id,
                        account["id"],
                        f"{account['username']} 默认项目",
                        "系统自动创建的默认项目",
                        now,
                        now,
                    ),
                )
                project = self.fetch_one("SELECT * FROM projects WHERE id = ?", (project_id,))

            if not project:
                continue

            for table in backfill_tables:
                self.execute(
                    f"""
                    UPDATE {_quote_identifier(table)}
                    SET project_id = ?
                    WHERE user_id = ? AND (project_id IS NULL OR project_id = '')
                    """,
                    (project["id"], account["id"]),
                )
Field(default_factory=dict) + + +class DouyinAccountSyncRequest(BaseModel): + profile_url: str = "" + session_cookie: str = "" + creator_center_urls: list[str] = Field(default_factory=lambda: list(DEFAULT_CREATOR_CENTER_URLS)) + manual_profile_payload: dict[str, Any] | None = None + manual_creator_pages: list[ManualPageCapture] = Field(default_factory=list) + manual_work_payloads: list[dict[str, Any]] = Field(default_factory=list) + discovery_note: str = "" + + +class DouyinAccountAnalysisRequest(BaseModel): + model_profile_ids: list[str] = Field(default_factory=list) + linked_account_ids: list[str] = Field(default_factory=list) + include_linked_accounts: bool = True + include_recent_similar_candidates: bool = True + max_videos: int = 12 + extra_focus: str = "" + temperature: float = 0.35 + + +class DouyinSimilarSearchRequest(BaseModel): + source_account_id: str | None = None + profile_url: str | None = None + candidate_urls: list[str] = Field(default_factory=list) + seed_linked_accounts: bool = True + search_public_pages: bool = True + model_profile_id: str | None = None + max_candidates: int = 10 + extra_requirements: str = "" + + +class DouyinBenchmarkLinkRequest(BaseModel): + target_account_ids: list[str] = Field(default_factory=list) + target_profile_urls: list[str] = Field(default_factory=list) + relation_type: str = "benchmark" + note: str = "" + search_id: str = "" + + +def _safe_json_dumps(value: Any) -> str: + return json.dumps(value, ensure_ascii=False, separators=(",", ":")) + + +def _safe_json_loads(value: str | None, fallback: Any) -> Any: + if not value: + return fallback + try: + return json.loads(value) + except Exception: + return fallback + + +def _first_non_empty(*values: Any) -> str: + for value in values: + if value is None: + continue + if isinstance(value, str): + stripped = value.strip() + if stripped: + return stripped + elif value not in ("", [], {}, ()): + return str(value) + return "" + + +def _dedupe_strings(values: Iterable[str]) -> 
list[str]: + result: list[str] = [] + seen: set[str] = set() + for value in values: + item = value.strip() + if not item: + continue + key = item.lower() + if key in seen: + continue + seen.add(key) + result.append(item) + return result + + +def _compact_text(value: Any, limit: int = 500) -> str: + text = str(value or "").strip() + if len(text) <= limit: + return text + return f"{text[: limit - 1]}…" + + +def _parse_count(value: Any) -> float: + if value is None: + return 0.0 + if isinstance(value, (int, float)): + return float(value) + text = str(value).strip().lower().replace(",", "") + if not text: + return 0.0 + + multiplier = 1.0 + if text.endswith("w") or text.endswith("万"): + multiplier = 10_000.0 + text = text[:-1] + elif text.endswith("亿"): + multiplier = 100_000_000.0 + text = text[:-1] + + text = text.replace("+", "") + match = re.search(r"-?\d+(?:\.\d+)?", text) + if not match: + return 0.0 + try: + return float(match.group()) * multiplier + except ValueError: + return 0.0 + + +def _normalize_timestamp(value: Any) -> str | None: + if value in (None, "", 0, "0"): + return None + if isinstance(value, str): + stripped = value.strip() + if not stripped: + return None + if re.match(r"^\d{4}-\d{2}-\d{2}T", stripped): + return stripped + if stripped.isdigit(): + value = int(stripped) + else: + return stripped + if isinstance(value, (int, float)): + ts = float(value) + if ts > 10_000_000_000: + ts /= 1000.0 + try: + return datetime.fromtimestamp(ts, tz=timezone.utc).replace(microsecond=0).isoformat() + except Exception: + return None + return None + + +def _extract_hashtags(*texts: str) -> list[str]: + tags: list[str] = [] + for text in texts: + if not text: + continue + tags.extend(match.group(1) for match in re.finditer(r"#([\w\u4e00-\u9fff]+)", text)) + return _dedupe_strings(tags) + + +def _extract_keywords(*texts: str) -> list[str]: + candidates: list[str] = [] + for text in texts: + if not text: + continue + candidates.extend(_extract_hashtags(text)) + 
candidates.extend(re.findall(r"[\u4e00-\u9fff]{2,8}", text)) + candidates.extend(re.findall(r"[A-Za-z][A-Za-z0-9_]{2,20}", text)) + stop_words = { + "视频", + "作品", + "抖音", + "账号", + "内容", + "发布", + "更多", + "关注", + "用户", + "douyin", + "profile" + } + filtered = [item for item in candidates if item.lower() not in stop_words] + return _dedupe_strings(filtered) + + +def _flatten_json(value: Any, prefix: str = "") -> list[tuple[str, str, str]]: + rows: list[tuple[str, str, str]] = [] + if isinstance(value, dict): + for key, child in value.items(): + next_prefix = f"{prefix}.{key}" if prefix else str(key) + rows.extend(_flatten_json(child, next_prefix)) + elif isinstance(value, list): + for index, child in enumerate(value): + next_prefix = f"{prefix}[{index}]" + rows.extend(_flatten_json(child, next_prefix)) + else: + field_type = type(value).__name__ + rows.append((prefix or "$", field_type, _compact_text(value, 2000))) + return rows + + +def _walk_json(value: Any) -> Iterable[dict[str, Any]]: + if isinstance(value, dict): + yield value + for child in value.values(): + yield from _walk_json(child) + elif isinstance(value, list): + for child in value: + yield from _walk_json(child) + + +def _extract_json_objects_from_text(text: str) -> list[Any]: + decoder = json.JSONDecoder() + objects: list[Any] = [] + seen: set[str] = set() + if not text: + return objects + + candidates = [text, unquote(text), unescape(text), unescape(unquote(text))] + for candidate in candidates: + snippet = candidate[:MAX_HTML_SEARCH_BYTES] + for match in re.finditer(r"[\{\[]", snippet): + try: + obj, _ = decoder.raw_decode(snippet[match.start() :]) + except Exception: + continue + marker = _safe_json_dumps(obj) + if marker in seen: + continue + seen.add(marker) + objects.append(obj) + if len(objects) >= 50: + return objects + return objects + + +def _extract_json_blobs_from_html(html: str) -> list[dict[str, Any]]: + blobs: list[dict[str, Any]] = [] + seen: set[str] = set() + for attrs, content in 
re.findall(r"]*)>(.*?)", html, re.IGNORECASE | re.DOTALL): + script_id_match = re.search(r'id=["\']([^"\']+)["\']', attrs, re.IGNORECASE) + script_id = script_id_match.group(1) if script_id_match else "" + for obj in _extract_json_objects_from_text(content.strip()): + marker = _safe_json_dumps(obj) + if marker in seen: + continue + seen.add(marker) + blobs.append({ + "script_id": script_id, + "payload": obj + }) + return blobs + + +def _profile_candidate_score(value: dict[str, Any]) -> int: + score = 0 + interesting_keys = { + "nickname", + "signature", + "sec_uid", + "secUid", + "uid", + "unique_id", + "short_id", + "aweme_count", + "following_count", + "follower_count", + "total_favorited" + } + score += sum(1 for key in interesting_keys if key in value) + if "author" in value and isinstance(value["author"], dict): + score += 2 + return score + + +def _video_candidate_score(value: dict[str, Any]) -> int: + score = 0 + if "statistics" in value and isinstance(value["statistics"], dict): + score += 3 + if "aweme_id" in value or "item_id" in value: + score += 2 + if "desc" in value or "title" in value: + score += 1 + return score + + +def _extract_profile_candidates(payload: Any) -> list[dict[str, Any]]: + candidates: list[dict[str, Any]] = [] + for item in _walk_json(payload): + if _profile_candidate_score(item) >= 3: + candidates.append(item) + if "author" in item and isinstance(item["author"], dict) and _profile_candidate_score(item["author"]) >= 3: + candidates.append(item["author"]) + return candidates + + +def _extract_video_candidates(payload: Any) -> list[dict[str, Any]]: + candidates: list[dict[str, Any]] = [] + for item in _walk_json(payload): + if _video_candidate_score(item) >= 3: + candidates.append(item) + return candidates + + +def _normalize_profile_candidate(candidate: dict[str, Any], fallback_url: str = "") -> dict[str, Any]: + stats_source = candidate.get("statistics") if isinstance(candidate.get("statistics"), dict) else {} + avatar = 
def _pick_best_profile(candidates: list[dict[str, Any]], fallback_url: str = "") -> dict[str, Any]:
    """Normalize *candidates* and return the most complete profile.

    Completeness is weighted: nickname 4, sec_uid 3, signature 2, a non-zero
    follower count 1. The earliest candidate wins a tie. With no candidates,
    an empty profile normalized with *fallback_url* is returned.
    """

    def completeness(profile: dict[str, Any]) -> int:
        weighted_fields = (
            (4, profile["nickname"]),
            (3, profile["sec_uid"]),
            (2, profile["signature"]),
            (1, profile["stats"]["followers"]),
        )
        return sum(weight for weight, present in weighted_fields if present)

    winner: dict[str, Any] | None = None
    top_rank = -1
    for raw in candidates:
        profile = _normalize_profile_candidate(raw, fallback_url=fallback_url)
        rank = completeness(profile)
        if rank > top_rank:
            winner, top_rank = profile, rank

    if winner:
        return winner
    return _normalize_profile_candidate({}, fallback_url=fallback_url)
_parse_count(stats_source.get("share_count") or candidate.get("share_count")), + "collect": _parse_count(stats_source.get("collect_count") or candidate.get("collect_count")) + }, + "raw": candidate + } + + +def _extract_videos(payloads: Iterable[Any]) -> list[dict[str, Any]]: + videos: list[dict[str, Any]] = [] + seen: set[str] = set() + for payload in payloads: + for candidate in _extract_video_candidates(payload): + normalized = _normalize_video_candidate(candidate) + dedupe_key = normalized["aweme_id"] or normalized["share_url"] or normalized["title"] + if not dedupe_key or dedupe_key in seen: + continue + seen.add(dedupe_key) + videos.append(normalized) + videos.sort( + key=lambda item: ( + item["stats"]["play"] + item["stats"]["like"] + item["stats"]["comment"] * 4 + item["stats"]["share"] * 6 + ), + reverse=True + ) + return videos + + +async def _fetch_html(url: str, cookie: str = "") -> tuple[str, str]: + headers = { + "User-Agent": DEFAULT_USER_AGENT, + "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8" + } + if cookie.strip(): + headers["Cookie"] = cookie.strip() + async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT, follow_redirects=True) as client: + response = await client.get(url, headers=headers) + response.raise_for_status() + return str(response.url), response.text + + +async def _discover_profile_urls_from_search(keywords: list[str], limit: int = 8) -> list[str]: + urls: list[str] = [] + seen: set[str] = set() + for keyword in keywords[:3]: + search_url = f"https://www.douyin.com/search/{quote(keyword)}?type=user" + try: + _, html = await _fetch_html(search_url) + except Exception: + continue + for match in re.findall(r'href=["\']([^"\']+/user/[^"\']+)["\']', html): + if match.startswith("/"): + match = f"https://www.douyin.com{match}" + cleaned = match.split("?")[0] + if cleaned in seen: + continue + seen.add(cleaned) + urls.append(cleaned) + if len(urls) >= limit: + return urls + return urls + + +def _summarize_videos(videos: list[dict[str, Any]], 
limit: int = 8) -> dict[str, Any]: + selected = videos[:limit] + if not selected: + return { + "count": 0, + "top_tags": [], + "avg_play": 0.0, + "avg_like": 0.0, + "avg_comment": 0.0, + "avg_share": 0.0, + "videos": [] + } + count = len(selected) + avg_play = sum(item["stats"]["play"] for item in selected) / count + avg_like = sum(item["stats"]["like"] for item in selected) / count + avg_comment = sum(item["stats"]["comment"] for item in selected) / count + avg_share = sum(item["stats"]["share"] for item in selected) / count + tag_counter = Counter(tag for item in selected for tag in item.get("tags", [])) + return { + "count": len(videos), + "top_tags": [tag for tag, _ in tag_counter.most_common(8)], + "avg_play": round(avg_play, 2), + "avg_like": round(avg_like, 2), + "avg_comment": round(avg_comment, 2), + "avg_share": round(avg_share, 2), + "videos": [ + { + "aweme_id": item["aweme_id"], + "title": _compact_text(item["title"], 120), + "description": _compact_text(item["description"], 180), + "tags": item["tags"][:6], + "published_at": item["published_at"], + "stats": item["stats"] + } + for item in selected + ] + } + + +def _jaccard(left: Iterable[str], right: Iterable[str]) -> float: + left_set = {item.strip().lower() for item in left if item.strip()} + right_set = {item.strip().lower() for item in right if item.strip()} + if not left_set and not right_set: + return 0.0 + intersection = len(left_set & right_set) + union = len(left_set | right_set) + return intersection / union if union else 0.0 + + +def _quality_score(account_payload: dict[str, Any]) -> float: + stats = account_payload.get("profile_stats", {}) + followers = float(stats.get("followers") or 0) + video_summary = account_payload.get("video_summary", {}) + avg_play = float(video_summary.get("avg_play") or 0) + avg_like = float(video_summary.get("avg_like") or 0) + avg_comment = float(video_summary.get("avg_comment") or 0) + avg_share = float(video_summary.get("avg_share") or 0) + base = followers / 
10_000.0 + engagement = avg_like / 1000.0 + avg_comment / 300.0 + avg_share / 200.0 + avg_play / 5000.0 + return round(base + engagement, 3) + + +def _heuristic_similarity(source_payload: dict[str, Any], candidate_payload: dict[str, Any]) -> dict[str, Any]: + source_keywords = source_payload.get("keywords", []) + candidate_keywords = candidate_payload.get("keywords", []) + topic_overlap = _jaccard(source_keywords, candidate_keywords) + tag_overlap = _jaccard( + source_payload.get("video_summary", {}).get("top_tags", []), + candidate_payload.get("video_summary", {}).get("top_tags", []) + ) + source_signature = source_payload.get("signature", "") + candidate_signature = candidate_payload.get("signature", "") + signature_overlap = _jaccard(_extract_keywords(source_signature), _extract_keywords(candidate_signature)) + quality = _quality_score(candidate_payload) + score = round(topic_overlap * 55 + tag_overlap * 20 + signature_overlap * 10 + min(quality, 15), 2) + return { + "topic_overlap": round(topic_overlap, 3), + "tag_overlap": round(tag_overlap, 3), + "signature_overlap": round(signature_overlap, 3), + "quality_score": quality, + "heuristic_score": score + } + + +def _build_model_label(profile: dict[str, Any]) -> str: + return _first_non_empty(profile.get("name"), profile.get("model_name"), profile.get("base_url")) + + +def _try_parse_agent_json(text: str) -> Any: + stripped = text.strip() + if not stripped: + return {} + try: + return json.loads(stripped) + except Exception: + pass + objects = _extract_json_objects_from_text(stripped) + return objects[0] if objects else {} + + +def register_douyin_routes(app: Any, legacy: Any) -> None: + def now() -> str: + return legacy.utc_now() + + def make_id(prefix: str) -> str: + return legacy.make_id(prefix) + + def ensure_schema() -> None: + schema = """ + CREATE TABLE IF NOT EXISTS douyin_accounts ( + id TEXT PRIMARY KEY, + user_id TEXT NOT NULL, + profile_url TEXT NOT NULL DEFAULT '', + canonical_profile_url TEXT NOT 
NULL DEFAULT '', + sec_uid TEXT NOT NULL DEFAULT '', + douyin_uid TEXT NOT NULL DEFAULT '', + douyin_id TEXT NOT NULL DEFAULT '', + nickname TEXT NOT NULL DEFAULT '', + signature TEXT NOT NULL DEFAULT '', + avatar_url TEXT NOT NULL DEFAULT '', + tags_json TEXT NOT NULL DEFAULT '[]', + profile_stats_json TEXT NOT NULL DEFAULT '{}', + raw_profile_json TEXT NOT NULL DEFAULT '{}', + source_mode TEXT NOT NULL DEFAULT 'public', + sync_status TEXT NOT NULL DEFAULT 'pending', + last_public_sync_at TEXT, + last_creator_sync_at TEXT, + last_analysis_at TEXT, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL, + FOREIGN KEY(user_id) REFERENCES accounts(id) ON DELETE CASCADE + ); + + CREATE INDEX IF NOT EXISTS idx_douyin_accounts_user_updated + ON douyin_accounts(user_id, updated_at DESC); + + CREATE INDEX IF NOT EXISTS idx_douyin_accounts_user_sec_uid + ON douyin_accounts(user_id, sec_uid); + + CREATE TABLE IF NOT EXISTS douyin_account_snapshots ( + id TEXT PRIMARY KEY, + account_id TEXT NOT NULL, + snapshot_type TEXT NOT NULL, + source_url TEXT NOT NULL DEFAULT '', + raw_payload_json TEXT NOT NULL DEFAULT '{}', + summary_json TEXT NOT NULL DEFAULT '{}', + field_count INTEGER NOT NULL DEFAULT 0, + collected_at TEXT NOT NULL, + created_at TEXT NOT NULL, + FOREIGN KEY(account_id) REFERENCES douyin_accounts(id) ON DELETE CASCADE + ); + + CREATE INDEX IF NOT EXISTS idx_douyin_snapshots_account_collected + ON douyin_account_snapshots(account_id, collected_at DESC); + + CREATE TABLE IF NOT EXISTS douyin_snapshot_fields ( + snapshot_id TEXT NOT NULL, + field_path TEXT NOT NULL, + field_type TEXT NOT NULL DEFAULT 'string', + field_value_text TEXT NOT NULL DEFAULT '', + PRIMARY KEY(snapshot_id, field_path), + FOREIGN KEY(snapshot_id) REFERENCES douyin_account_snapshots(id) ON DELETE CASCADE + ); + + CREATE TABLE IF NOT EXISTS douyin_videos ( + id TEXT PRIMARY KEY, + account_id TEXT NOT NULL, + aweme_id TEXT NOT NULL DEFAULT '', + title TEXT NOT NULL DEFAULT '', + description TEXT 
NOT NULL DEFAULT '', + share_url TEXT NOT NULL DEFAULT '', + cover_url TEXT NOT NULL DEFAULT '', + duration_sec REAL NOT NULL DEFAULT 0, + published_at TEXT, + tags_json TEXT NOT NULL DEFAULT '[]', + stats_json TEXT NOT NULL DEFAULT '{}', + raw_json TEXT NOT NULL DEFAULT '{}', + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL, + FOREIGN KEY(account_id) REFERENCES douyin_accounts(id) ON DELETE CASCADE + ); + + CREATE INDEX IF NOT EXISTS idx_douyin_videos_account_updated + ON douyin_videos(account_id, updated_at DESC); + + CREATE INDEX IF NOT EXISTS idx_douyin_videos_account_aweme + ON douyin_videos(account_id, aweme_id); + + CREATE TABLE IF NOT EXISTS douyin_analysis_reports ( + id TEXT PRIMARY KEY, + account_id TEXT NOT NULL, + user_id TEXT NOT NULL, + focus_text TEXT NOT NULL DEFAULT '', + model_profile_ids_json TEXT NOT NULL DEFAULT '[]', + linked_account_ids_json TEXT NOT NULL DEFAULT '[]', + prompt_text TEXT NOT NULL DEFAULT '', + context_json TEXT NOT NULL DEFAULT '{}', + created_at TEXT NOT NULL, + FOREIGN KEY(account_id) REFERENCES douyin_accounts(id) ON DELETE CASCADE, + FOREIGN KEY(user_id) REFERENCES accounts(id) ON DELETE CASCADE + ); + + CREATE INDEX IF NOT EXISTS idx_douyin_analysis_reports_account_created + ON douyin_analysis_reports(account_id, created_at DESC); + + CREATE TABLE IF NOT EXISTS douyin_analysis_suggestions ( + id TEXT PRIMARY KEY, + report_id TEXT NOT NULL, + model_profile_id TEXT NOT NULL DEFAULT '', + model_label TEXT NOT NULL DEFAULT '', + status TEXT NOT NULL DEFAULT 'ok', + suggestion_text TEXT NOT NULL DEFAULT '', + parsed_json TEXT NOT NULL DEFAULT '{}', + created_at TEXT NOT NULL, + FOREIGN KEY(report_id) REFERENCES douyin_analysis_reports(id) ON DELETE CASCADE + ); + + CREATE INDEX IF NOT EXISTS idx_douyin_analysis_suggestions_report + ON douyin_analysis_suggestions(report_id, created_at ASC); + + CREATE TABLE IF NOT EXISTS douyin_similarity_searches ( + id TEXT PRIMARY KEY, + user_id TEXT NOT NULL, + source_account_id 
TEXT, + source_profile_url TEXT NOT NULL DEFAULT '', + keywords_json TEXT NOT NULL DEFAULT '[]', + prompt_text TEXT NOT NULL DEFAULT '', + context_json TEXT NOT NULL DEFAULT '{}', + created_at TEXT NOT NULL, + FOREIGN KEY(user_id) REFERENCES accounts(id) ON DELETE CASCADE, + FOREIGN KEY(source_account_id) REFERENCES douyin_accounts(id) ON DELETE SET NULL + ); + + CREATE INDEX IF NOT EXISTS idx_douyin_similarity_searches_user_created + ON douyin_similarity_searches(user_id, created_at DESC); + + CREATE TABLE IF NOT EXISTS douyin_similarity_candidates ( + id TEXT PRIMARY KEY, + search_id TEXT NOT NULL, + candidate_account_id TEXT, + candidate_profile_url TEXT NOT NULL DEFAULT '', + heuristic_score REAL NOT NULL DEFAULT 0, + agent_score REAL NOT NULL DEFAULT 0, + rationale_text TEXT NOT NULL DEFAULT '', + dimensions_json TEXT NOT NULL DEFAULT '{}', + raw_output_json TEXT NOT NULL DEFAULT '{}', + rank_index INTEGER NOT NULL DEFAULT 0, + created_at TEXT NOT NULL, + FOREIGN KEY(search_id) REFERENCES douyin_similarity_searches(id) ON DELETE CASCADE, + FOREIGN KEY(candidate_account_id) REFERENCES douyin_accounts(id) ON DELETE SET NULL + ); + + CREATE INDEX IF NOT EXISTS idx_douyin_similarity_candidates_search_rank + ON douyin_similarity_candidates(search_id, rank_index ASC); + + CREATE TABLE IF NOT EXISTS douyin_account_relations ( + id TEXT PRIMARY KEY, + user_id TEXT NOT NULL, + source_account_id TEXT NOT NULL, + target_account_id TEXT, + target_profile_url TEXT NOT NULL DEFAULT '', + relation_type TEXT NOT NULL DEFAULT 'benchmark', + note TEXT NOT NULL DEFAULT '', + search_id TEXT NOT NULL DEFAULT '', + created_at TEXT NOT NULL, + FOREIGN KEY(user_id) REFERENCES accounts(id) ON DELETE CASCADE, + FOREIGN KEY(source_account_id) REFERENCES douyin_accounts(id) ON DELETE CASCADE, + FOREIGN KEY(target_account_id) REFERENCES douyin_accounts(id) ON DELETE SET NULL + ); + + CREATE INDEX IF NOT EXISTS idx_douyin_account_relations_source + ON 
douyin_account_relations(source_account_id, created_at DESC); + """ + with legacy.db.session() as conn: + conn.executescript(schema) + + ensure_schema() + + @app.on_event("startup") + def _startup_douyin_schema() -> None: + ensure_schema() + + def _require_owned_account(account_id: str, user_id: str) -> dict[str, Any]: + row = legacy.db.fetch_one( + "SELECT * FROM douyin_accounts WHERE id = ? AND user_id = ?", + (account_id, user_id) + ) + if not row: + raise HTTPException(status_code=404, detail="Douyin account not found") + return row + + def _fetch_model_profiles(account_id: str) -> list[dict[str, Any]]: + return legacy.db.fetch_all( + """ + SELECT * + FROM model_profiles + WHERE owner_account_id IS NULL OR owner_account_id = ? + ORDER BY is_default DESC, created_at ASC + """, + (account_id,) + ) + + def _resolve_model_profiles(account: dict[str, Any], requested_ids: list[str]) -> list[dict[str, Any]]: + profiles = _fetch_model_profiles(account["id"]) + if not profiles: + raise HTTPException(status_code=400, detail="No available model profiles") + if not requested_ids: + return profiles + profile_map = {row["id"]: row for row in profiles} + missing = [profile_id for profile_id in requested_ids if profile_id not in profile_map] + if missing: + raise HTTPException(status_code=404, detail=f"Unknown model profiles: {', '.join(missing)}") + return [profile_map[profile_id] for profile_id in requested_ids] + + async def _collect_public_profile(profile_url: str, manual_payload: dict[str, Any] | None) -> dict[str, Any]: + source_url = profile_url.strip() + blobs: list[dict[str, Any]] = [] + errors: list[str] = [] + + if manual_payload: + blobs.append({"script_id": "manual_profile_payload", "payload": manual_payload}) + + if source_url: + try: + final_url, html = await _fetch_html(source_url) + source_url = final_url + blobs.extend(_extract_json_blobs_from_html(html)) + except Exception as exc: + errors.append(f"public_profile_fetch_failed: {exc}") + + payloads = 
[item["payload"] for item in blobs] + profile = _pick_best_profile( + [candidate for payload in payloads for candidate in _extract_profile_candidates(payload)], + fallback_url=source_url + ) + videos = _extract_videos(payloads) + return { + "profile": profile, + "videos": videos, + "raw_pages": blobs, + "errors": errors, + "source_url": source_url + } + + async def _collect_creator_center_pages( + urls: list[str], + cookie: str, + manual_pages: list[ManualPageCapture] + ) -> dict[str, Any]: + pages: list[dict[str, Any]] = [] + errors: list[str] = [] + + for page in manual_pages: + pages.append({ + "url": page.url, + "title": page.title, + "blobs": [{"script_id": "manual_creator_payload", "payload": page.payload}] + }) + + if cookie.strip(): + for url in urls: + try: + final_url, html = await _fetch_html(url, cookie=cookie) + pages.append({ + "url": final_url, + "title": "", + "blobs": _extract_json_blobs_from_html(html) + }) + except Exception as exc: + errors.append(f"creator_center_fetch_failed[{url}]: {exc}") + + return {"pages": pages, "errors": errors} + + def _upsert_account( + owner: dict[str, Any], + profile: dict[str, Any], + sync_request: DouyinAccountSyncRequest, + public_data: dict[str, Any], + creator_data: dict[str, Any] + ) -> dict[str, Any]: + lookup_candidates = [ + ("sec_uid", profile.get("sec_uid", "")), + ("douyin_id", profile.get("douyin_id", "")), + ("canonical_profile_url", profile.get("canonical_profile_url", "")) + ] + existing: dict[str, Any] | None = None + for field_name, field_value in lookup_candidates: + if not field_value: + continue + existing = legacy.db.fetch_one( + f"SELECT * FROM douyin_accounts WHERE user_id = ? AND {field_name} = ? 
LIMIT 1", + (owner["id"], field_value) + ) + if existing: + break + + account_id = existing["id"] if existing else make_id("dyacct") + created_at = existing["created_at"] if existing else now() + updated_at = now() + + tags = _dedupe_strings(profile.get("tags", []) + _extract_keywords(profile.get("nickname", ""), profile.get("signature", ""))) + profile_stats = profile.get("stats", {}) + source_mode = "creator_center" if creator_data["pages"] else "public" + sync_status = "partial" if public_data["errors"] or creator_data["errors"] else "ready" + + if existing: + legacy.db.execute( + """ + UPDATE douyin_accounts + SET profile_url = ?, canonical_profile_url = ?, sec_uid = ?, douyin_uid = ?, douyin_id = ?, + nickname = ?, signature = ?, avatar_url = ?, tags_json = ?, profile_stats_json = ?, + raw_profile_json = ?, source_mode = ?, sync_status = ?, last_public_sync_at = ?, + last_creator_sync_at = ?, updated_at = ? + WHERE id = ? + """, + ( + profile.get("profile_url", ""), + profile.get("canonical_profile_url", ""), + profile.get("sec_uid", ""), + profile.get("douyin_uid", ""), + profile.get("douyin_id", ""), + profile.get("nickname", ""), + profile.get("signature", ""), + profile.get("avatar_url", ""), + _safe_json_dumps(tags), + _safe_json_dumps(profile_stats), + _safe_json_dumps({ + "profile": profile.get("raw", {}), + "discovery_note": sync_request.discovery_note + }), + source_mode, + sync_status, + now() if public_data["raw_pages"] else existing.get("last_public_sync_at"), + now() if creator_data["pages"] else existing.get("last_creator_sync_at"), + updated_at, + account_id + ) + ) + else: + legacy.db.execute( + """ + INSERT INTO douyin_accounts ( + id, user_id, profile_url, canonical_profile_url, sec_uid, douyin_uid, douyin_id, + nickname, signature, avatar_url, tags_json, profile_stats_json, raw_profile_json, + source_mode, sync_status, last_public_sync_at, last_creator_sync_at, created_at, updated_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 
?, ?, ?, ?) + """, + ( + account_id, + owner["id"], + profile.get("profile_url", ""), + profile.get("canonical_profile_url", ""), + profile.get("sec_uid", ""), + profile.get("douyin_uid", ""), + profile.get("douyin_id", ""), + profile.get("nickname", ""), + profile.get("signature", ""), + profile.get("avatar_url", ""), + _safe_json_dumps(tags), + _safe_json_dumps(profile_stats), + _safe_json_dumps({ + "profile": profile.get("raw", {}), + "discovery_note": sync_request.discovery_note + }), + source_mode, + sync_status, + now() if public_data["raw_pages"] else None, + now() if creator_data["pages"] else None, + created_at, + updated_at + ) + ) + + account_row = _require_owned_account(account_id, owner["id"]) + _persist_snapshots_and_videos(account_row, public_data, creator_data, sync_request) + return _require_owned_account(account_id, owner["id"]) + + def _persist_snapshot( + account_row: dict[str, Any], + snapshot_type: str, + source_url: str, + payload: Any, + summary: dict[str, Any] + ) -> str: + snapshot_id = make_id("dysnap") + collected_at = now() + fields = _flatten_json(payload) + legacy.db.execute( + """ + INSERT INTO douyin_account_snapshots ( + id, account_id, snapshot_type, source_url, raw_payload_json, summary_json, + field_count, collected_at, created_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + ( + snapshot_id, + account_row["id"], + snapshot_type, + source_url, + _safe_json_dumps(payload), + _safe_json_dumps(summary), + len(fields), + collected_at, + collected_at + ) + ) + for field_path, field_type, field_value in fields: + legacy.db.execute( + """ + INSERT OR REPLACE INTO douyin_snapshot_fields ( + snapshot_id, field_path, field_type, field_value_text + ) VALUES (?, ?, ?, ?) 
+ """, + (snapshot_id, field_path, field_type, field_value) + ) + return snapshot_id + + def _persist_snapshots_and_videos( + account_row: dict[str, Any], + public_data: dict[str, Any], + creator_data: dict[str, Any], + sync_request: DouyinAccountSyncRequest + ) -> None: + if public_data["raw_pages"]: + public_payload = { + "pages": public_data["raw_pages"], + "errors": public_data["errors"], + "source_url": public_data["source_url"] + } + _persist_snapshot( + account_row, + "public_profile", + public_data["source_url"], + public_payload, + { + "video_count": len(public_data["videos"]), + "nickname": public_data["profile"].get("nickname", ""), + "tags": public_data["profile"].get("tags", []) + } + ) + + for page in creator_data["pages"]: + payload = { + "title": page["title"], + "blobs": page["blobs"] + } + _persist_snapshot( + account_row, + "creator_center", + page["url"], + payload, + { + "blob_count": len(page["blobs"]), + "field_count": len(_flatten_json(payload)) + } + ) + + for manual_video in sync_request.manual_work_payloads: + normalized = _normalize_video_candidate(manual_video) + public_data["videos"].append(normalized) + + deduped: dict[str, dict[str, Any]] = {} + for video in public_data["videos"]: + key = video["aweme_id"] or video["share_url"] or video["title"] + if key and key not in deduped: + deduped[key] = video + + for video in deduped.values(): + existing = None + if video["aweme_id"]: + existing = legacy.db.fetch_one( + "SELECT id FROM douyin_videos WHERE account_id = ? AND aweme_id = ? LIMIT 1", + (account_row["id"], video["aweme_id"]) + ) + video_id = existing["id"] if existing else make_id("dyvideo") + created_at = now() + if existing: + legacy.db.execute( + """ + UPDATE douyin_videos + SET title = ?, description = ?, share_url = ?, cover_url = ?, duration_sec = ?, + published_at = ?, tags_json = ?, stats_json = ?, raw_json = ?, updated_at = ? + WHERE id = ? 
+ """, + ( + video["title"], + video["description"], + video["share_url"], + video["cover_url"], + video["duration_sec"], + video["published_at"], + _safe_json_dumps(video["tags"]), + _safe_json_dumps(video["stats"]), + _safe_json_dumps(video["raw"]), + now(), + video_id + ) + ) + else: + legacy.db.execute( + """ + INSERT INTO douyin_videos ( + id, account_id, aweme_id, title, description, share_url, cover_url, + duration_sec, published_at, tags_json, stats_json, raw_json, created_at, updated_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + ( + video_id, + account_row["id"], + video["aweme_id"], + video["title"], + video["description"], + video["share_url"], + video["cover_url"], + video["duration_sec"], + video["published_at"], + _safe_json_dumps(video["tags"]), + _safe_json_dumps(video["stats"]), + _safe_json_dumps(video["raw"]), + created_at, + created_at + ) + ) + + def _list_videos(account_id: str, limit: int = 20) -> list[dict[str, Any]]: + rows = legacy.db.fetch_all( + """ + SELECT * + FROM douyin_videos + WHERE account_id = ? + ORDER BY COALESCE(published_at, updated_at) DESC, updated_at DESC + LIMIT ? 
+ """, + (account_id, limit) + ) + payloads: list[dict[str, Any]] = [] + for row in rows: + payloads.append({ + "id": row["id"], + "aweme_id": row["aweme_id"], + "title": row["title"], + "description": row["description"], + "share_url": row["share_url"], + "cover_url": row["cover_url"], + "duration_sec": row["duration_sec"], + "published_at": row["published_at"], + "tags": _safe_json_loads(row["tags_json"], []), + "stats": _safe_json_loads(row["stats_json"], {}), + "raw": _safe_json_loads(row["raw_json"], {}) + }) + return payloads + + def _build_account_payload(account_row: dict[str, Any], include_recent_videos: int = 8) -> dict[str, Any]: + videos = _list_videos(account_row["id"], limit=max(include_recent_videos, 12)) + tags = _safe_json_loads(account_row["tags_json"], []) + profile_stats = _safe_json_loads(account_row["profile_stats_json"], {}) + video_summary = _summarize_videos(videos, limit=include_recent_videos) + keywords = _dedupe_strings( + tags + + _extract_keywords(account_row["nickname"], account_row["signature"]) + + video_summary["top_tags"] + + [video["title"] for video in video_summary["videos"]] + ) + return { + "id": account_row["id"], + "nickname": account_row["nickname"], + "signature": account_row["signature"], + "profile_url": account_row["canonical_profile_url"] or account_row["profile_url"], + "avatar_url": account_row["avatar_url"], + "sec_uid": account_row["sec_uid"], + "douyin_id": account_row["douyin_id"], + "profile_stats": profile_stats, + "tags": tags, + "keywords": keywords[:18], + "sync_status": account_row["sync_status"], + "video_summary": video_summary + } + + def _list_linked_accounts(account_row: dict[str, Any]) -> list[dict[str, Any]]: + relation_rows = legacy.db.fetch_all( + """ + SELECT rel.*, target.nickname AS target_nickname, target.signature AS target_signature, + target.canonical_profile_url AS target_canonical_profile_url, target.profile_stats_json AS target_profile_stats_json, + target.tags_json AS target_tags_json + 
FROM douyin_account_relations rel + LEFT JOIN douyin_accounts target ON target.id = rel.target_account_id + WHERE rel.source_account_id = ? + ORDER BY rel.created_at DESC + """, + (account_row["id"],) + ) + payloads: list[dict[str, Any]] = [] + for row in relation_rows: + payloads.append({ + "relation_id": row["id"], + "relation_type": row["relation_type"], + "note": row["note"], + "search_id": row["search_id"], + "created_at": row["created_at"], + "target_account_id": row["target_account_id"], + "target_profile_url": row["target_profile_url"] or row.get("target_canonical_profile_url", ""), + "target_nickname": row.get("target_nickname", ""), + "target_signature": row.get("target_signature", ""), + "target_profile_stats": _safe_json_loads(row.get("target_profile_stats_json"), {}), + "target_tags": _safe_json_loads(row.get("target_tags_json"), []) + }) + return payloads + + def _build_workspace_payload(account_row: dict[str, Any]) -> dict[str, Any]: + account_payload = _build_account_payload(account_row) + latest_public_snapshot = legacy.db.fetch_one( + """ + SELECT * + FROM douyin_account_snapshots + WHERE account_id = ? AND snapshot_type = 'public_profile' + ORDER BY collected_at DESC + LIMIT 1 + """, + (account_row["id"],) + ) + latest_creator_snapshot = legacy.db.fetch_one( + """ + SELECT * + FROM douyin_account_snapshots + WHERE account_id = ? AND snapshot_type = 'creator_center' + ORDER BY collected_at DESC + LIMIT 1 + """, + (account_row["id"],) + ) + reports = legacy.db.fetch_all( + """ + SELECT * + FROM douyin_analysis_reports + WHERE account_id = ? + ORDER BY created_at DESC + LIMIT 5 + """, + (account_row["id"],) + ) + report_payloads = [] + for report in reports: + suggestions = legacy.db.fetch_all( + "SELECT * FROM douyin_analysis_suggestions WHERE report_id = ? 
ORDER BY created_at ASC", + (report["id"],) + ) + report_payloads.append({ + "id": report["id"], + "focus_text": report["focus_text"], + "model_profile_ids": _safe_json_loads(report["model_profile_ids_json"], []), + "linked_account_ids": _safe_json_loads(report["linked_account_ids_json"], []), + "created_at": report["created_at"], + "suggestions": [ + { + "id": suggestion["id"], + "model_profile_id": suggestion["model_profile_id"], + "model_label": suggestion["model_label"], + "status": suggestion["status"], + "suggestion_text": suggestion["suggestion_text"], + "parsed_json": _safe_json_loads(suggestion["parsed_json"], {}) + } + for suggestion in suggestions + ] + }) + recent_searches = legacy.db.fetch_all( + """ + SELECT * + FROM douyin_similarity_searches + WHERE source_account_id = ? + ORDER BY created_at DESC + LIMIT 5 + """, + (account_row["id"],) + ) + return { + "account": account_payload, + "latest_public_snapshot": { + "id": latest_public_snapshot["id"], + "source_url": latest_public_snapshot["source_url"], + "field_count": latest_public_snapshot["field_count"], + "collected_at": latest_public_snapshot["collected_at"], + "summary": _safe_json_loads(latest_public_snapshot["summary_json"], {}) + } if latest_public_snapshot else None, + "latest_creator_snapshot": { + "id": latest_creator_snapshot["id"], + "source_url": latest_creator_snapshot["source_url"], + "field_count": latest_creator_snapshot["field_count"], + "collected_at": latest_creator_snapshot["collected_at"], + "summary": _safe_json_loads(latest_creator_snapshot["summary_json"], {}) + } if latest_creator_snapshot else None, + "linked_accounts": _list_linked_accounts(account_row), + "recent_reports": report_payloads, + "recent_similarity_searches": [ + { + "id": row["id"], + "keywords": _safe_json_loads(row["keywords_json"], []), + "created_at": row["created_at"] + } + for row in recent_searches + ], + "available_model_profiles": [ + { + "id": row["id"], + "name": row["name"], + "model_name": 
row["model_name"], + "base_url": row["base_url"], + "is_default": bool(row["is_default"]) + } + for row in _fetch_model_profiles(account_row["user_id"]) + ] + } + + def _list_snapshots(account_id: str, limit: int = 20) -> list[dict[str, Any]]: + rows = legacy.db.fetch_all( + """ + SELECT * + FROM douyin_account_snapshots + WHERE account_id = ? + ORDER BY collected_at DESC + LIMIT ? + """, + (account_id, limit) + ) + return [ + { + "id": row["id"], + "snapshot_type": row["snapshot_type"], + "source_url": row["source_url"], + "field_count": row["field_count"], + "collected_at": row["collected_at"], + "summary": _safe_json_loads(row["summary_json"], {}) + } + for row in rows + ] + + def _get_snapshot_detail(snapshot_id: str, account_id: str) -> dict[str, Any]: + row = legacy.db.fetch_one( + """ + SELECT * + FROM douyin_account_snapshots + WHERE id = ? AND account_id = ? + LIMIT 1 + """, + (snapshot_id, account_id) + ) + if not row: + raise HTTPException(status_code=404, detail="Snapshot not found") + fields = legacy.db.fetch_all( + """ + SELECT field_path, field_type, field_value_text + FROM douyin_snapshot_fields + WHERE snapshot_id = ? 
+ ORDER BY field_path ASC + """, + (snapshot_id,) + ) + return { + "id": row["id"], + "snapshot_type": row["snapshot_type"], + "source_url": row["source_url"], + "field_count": row["field_count"], + "collected_at": row["collected_at"], + "summary": _safe_json_loads(row["summary_json"], {}), + "raw_payload": _safe_json_loads(row["raw_payload_json"], {}), + "fields": fields + } + + async def _run_account_analysis( + account_row: dict[str, Any], + owner: dict[str, Any], + request: DouyinAccountAnalysisRequest + ) -> dict[str, Any]: + target_payload = _build_account_payload(account_row, include_recent_videos=max(4, min(request.max_videos, 12))) + linked_rows = _list_linked_accounts(account_row) + linked_account_ids = list(request.linked_account_ids) + if request.include_linked_accounts: + linked_account_ids.extend( + row["target_account_id"] for row in linked_rows if row.get("target_account_id") + ) + linked_account_ids = _dedupe_strings(linked_account_ids) + benchmark_payloads: list[dict[str, Any]] = [] + for linked_account_id in linked_account_ids: + linked_row = _require_owned_account(linked_account_id, owner["id"]) + benchmark_payloads.append(_build_account_payload(linked_row, include_recent_videos=6)) + + if request.include_recent_similar_candidates and not benchmark_payloads: + latest_search = legacy.db.fetch_one( + """ + SELECT * + FROM douyin_similarity_searches + WHERE source_account_id = ? + ORDER BY created_at DESC + LIMIT 1 + """, + (account_row["id"],) + ) + if latest_search: + candidate_rows = legacy.db.fetch_all( + """ + SELECT cand.*, acct.user_id AS account_user_id + FROM douyin_similarity_candidates cand + LEFT JOIN douyin_accounts acct ON acct.id = cand.candidate_account_id + WHERE cand.search_id = ? 
+ ORDER BY cand.rank_index ASC + LIMIT 3 + """, + (latest_search["id"],) + ) + for candidate_row in candidate_rows: + candidate_account_id = candidate_row.get("candidate_account_id") + if not candidate_account_id: + continue + linked_candidate = _require_owned_account(candidate_account_id, owner["id"]) + benchmark_payloads.append(_build_account_payload(linked_candidate, include_recent_videos=6)) + + profiles = _resolve_model_profiles(owner, request.model_profile_ids) + system_prompt = ( + "你是资深抖音增长顾问。你会基于账号画像、创作者中心字段、作品表现和对标账号内容," + "给出可执行的优化建议。请始终返回 JSON 对象,包含这些字段:" + "summary、strengths、weaknesses、benchmark_insights、content_plan、" + "growth_actions、deep_search_hypotheses。每个数组字段请给出 3-6 条中文建议。" + ) + analysis_context = { + "target_account": target_payload, + "benchmark_accounts": benchmark_payloads[:6], + "focus": request.extra_focus, + "creator_center_snapshot_summary": _safe_json_loads( + (legacy.db.fetch_one( + """ + SELECT summary_json + FROM douyin_account_snapshots + WHERE account_id = ? AND snapshot_type = 'creator_center' + ORDER BY collected_at DESC + LIMIT 1 + """, + (account_row["id"],) + ) or {}).get("summary_json"), + {} + ) + } + user_prompt = ( + "请分析以下抖音账号,并分别给出内容方向、选题结构、互动增长、账号定位和对标拆解建议。" + "如果提供了对标账号,要重点指出可借鉴但不应直接照搬的部分。" + f"\n\n输入上下文:\n{json.dumps(analysis_context, ensure_ascii=False, indent=2)}" + ) + + report_id = make_id("dyreport") + created_at = now() + legacy.db.execute( + """ + INSERT INTO douyin_analysis_reports ( + id, account_id, user_id, focus_text, model_profile_ids_json, + linked_account_ids_json, prompt_text, context_json, created_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ """, + ( + report_id, + account_row["id"], + owner["id"], + request.extra_focus, + _safe_json_dumps([profile["id"] for profile in profiles]), + _safe_json_dumps(linked_account_ids), + user_prompt, + _safe_json_dumps(analysis_context), + created_at + ) + ) + + async def _analyze_with_model(profile: dict[str, Any]) -> dict[str, Any]: + try: + output = await legacy.call_model( + profile, + system_prompt=system_prompt, + user_prompt=user_prompt, + temperature=request.temperature + ) + parsed = _try_parse_agent_json(output) + status = "ok" + except Exception as exc: + output = str(exc) + parsed = {} + status = "error" + suggestion_id = make_id("dysady") + legacy.db.execute( + """ + INSERT INTO douyin_analysis_suggestions ( + id, report_id, model_profile_id, model_label, status, + suggestion_text, parsed_json, created_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, + ( + suggestion_id, + report_id, + profile["id"], + _build_model_label(profile), + status, + output, + _safe_json_dumps(parsed), + now() + ) + ) + return { + "id": suggestion_id, + "model_profile_id": profile["id"], + "model_label": _build_model_label(profile), + "status": status, + "suggestion_text": output, + "parsed_json": parsed + } + + suggestions = await asyncio.gather(*[_analyze_with_model(profile) for profile in profiles]) + legacy.db.execute( + "UPDATE douyin_accounts SET last_analysis_at = ?, updated_at = ? 
WHERE id = ?", + (now(), now(), account_row["id"]) + ) + return { + "report_id": report_id, + "created_at": created_at, + "context": analysis_context, + "suggestions": suggestions + } + + async def _prepare_similarity_source( + owner: dict[str, Any], + request: DouyinSimilarSearchRequest + ) -> tuple[dict[str, Any] | None, dict[str, Any]]: + if request.source_account_id: + account_row = _require_owned_account(request.source_account_id, owner["id"]) + return account_row, _build_account_payload(account_row) + + if not (request.profile_url or "").strip(): + raise HTTPException(status_code=400, detail="source_account_id or profile_url is required") + + public_data = await _collect_public_profile(request.profile_url or "", None) + if not public_data["profile"].get("nickname") and not public_data["videos"]: + raise HTTPException(status_code=400, detail="Unable to parse the shared Douyin profile page") + payload = { + "id": "", + "nickname": public_data["profile"].get("nickname", ""), + "signature": public_data["profile"].get("signature", ""), + "profile_url": public_data["profile"].get("canonical_profile_url", "") or request.profile_url, + "avatar_url": public_data["profile"].get("avatar_url", ""), + "sec_uid": public_data["profile"].get("sec_uid", ""), + "douyin_id": public_data["profile"].get("douyin_id", ""), + "profile_stats": public_data["profile"].get("stats", {}), + "tags": public_data["profile"].get("tags", []), + "video_summary": _summarize_videos(public_data["videos"], limit=6) + } + payload["keywords"] = _dedupe_strings( + payload["tags"] + _extract_keywords(payload["nickname"], payload["signature"]) + + payload["video_summary"]["top_tags"] + + [video["title"] for video in payload["video_summary"]["videos"]] + ) + return None, payload + + async def _fetch_or_create_candidate(owner: dict[str, Any], profile_url: str) -> dict[str, Any] | None: + existing = legacy.db.fetch_one( + """ + SELECT * + FROM douyin_accounts + WHERE user_id = ? 
async def _fetch_or_create_candidate(owner: dict[str, Any], profile_url: str) -> dict[str, Any] | None:
    """Return the owner's account row for *profile_url*, creating it if needed.

    An existing row matching the URL (canonical or raw) is returned as-is.
    Otherwise the public profile is collected and upserted. ``None`` is
    returned when the page yields neither a nickname nor any videos.
    """
    existing = legacy.db.fetch_one(
        """
        SELECT *
        FROM douyin_accounts
        WHERE user_id = ?
          AND (canonical_profile_url = ? OR profile_url = ?)
        LIMIT 1
        """,
        (owner["id"], profile_url, profile_url)
    )
    if existing:
        return existing

    public_data = await _collect_public_profile(profile_url, None)
    profile = public_data["profile"]
    if not (profile.get("nickname") or public_data["videos"]):
        return None
    sync_request = DouyinAccountSyncRequest(
        profile_url=profile_url,
        manual_work_payloads=[video["raw"] for video in public_data["videos"]]
    )
    return _upsert_account(owner, profile, sync_request, public_data, {"pages": [], "errors": []})


async def _run_similarity_search(owner: dict[str, Any], request: DouyinSimilarSearchRequest) -> dict[str, Any]:
    """Find benchmark accounts similar to the requested source account.

    The search gathers candidate rows (the owner's other accounts, linked
    accounts, explicit and discovered URLs), ranks them heuristically, stores
    the search record, asks the model to score the shortlist and stores one
    candidate record per scored item. When the model call fails or returns
    nothing usable, the heuristic ranking is stored instead.

    Returns:
        A dict with ``search_id``, ``source_account``, ``model_profile``,
        ``raw_model_output`` and the saved ``candidates``.
    """
    source_account_row, source_payload = await _prepare_similarity_source(owner, request)
    profile = legacy.model_profile_for_account(owner["id"], request.model_profile_id)
    existing_accounts = legacy.db.fetch_all(
        """
        SELECT *
        FROM douyin_accounts
        WHERE user_id = ?
        ORDER BY updated_at DESC
        """,
        (owner["id"],)
    )

    # --- 1. Collect candidate rows -------------------------------------
    candidate_rows: list[dict[str, Any]] = []
    seen_urls: set[str] = set()
    source_id = source_account_row["id"] if source_account_row else ""
    for row in existing_accounts:
        if row["id"] == source_id:
            continue
        candidate_rows.append(row)
        seen_urls.add(row["canonical_profile_url"] or row["profile_url"])

    if request.seed_linked_accounts and source_account_row:
        for linked in _list_linked_accounts(source_account_row):
            linked_url = linked.get("target_profile_url", "")
            linked_account_id = linked.get("target_account_id")
            if not linked_url or linked_url in seen_urls:
                continue
            if not linked_account_id:
                # URL-only links have no row to add. They must NOT be marked
                # as seen: doing so silently dropped the same URL when it was
                # also supplied via candidate_urls or found by public search.
                continue
            seen_urls.add(linked_url)
            candidate_rows.append(_require_owned_account(linked_account_id, owner["id"]))

    candidate_urls = _dedupe_strings(request.candidate_urls)
    if request.search_public_pages:
        discovered = await _discover_profile_urls_from_search(source_payload.get("keywords", []), limit=6)
        candidate_urls.extend(discovered)
        candidate_urls = _dedupe_strings(candidate_urls)

    source_url = source_payload.get("profile_url")
    for candidate_url in candidate_urls:
        if candidate_url in seen_urls or candidate_url == source_url:
            continue
        candidate_row = await _fetch_or_create_candidate(owner, candidate_url)
        if candidate_row:
            candidate_rows.append(candidate_row)
            seen_urls.add(candidate_url)

    # --- 2. Build payloads and shortlist by heuristic score ------------
    candidate_payloads: list[dict[str, Any]] = []
    seen_account_ids: set[str] = set()
    for row in candidate_rows:
        if row["id"] in seen_account_ids:
            continue
        seen_account_ids.add(row["id"])
        payload = _build_account_payload(row, include_recent_videos=6)
        payload["heuristics"] = _heuristic_similarity(source_payload, payload)
        candidate_payloads.append(payload)

    candidate_payloads.sort(key=lambda item: item["heuristics"]["heuristic_score"], reverse=True)
    # At least three candidates are kept even if the request asks for fewer.
    candidate_payloads = candidate_payloads[: max(3, request.max_candidates)]

    # --- 3. Persist the search record -----------------------------------
    search_id = make_id("dysearch")
    prompt_context = {
        "source_account": source_payload,
        "candidate_accounts": candidate_payloads,
        "extra_requirements": request.extra_requirements
    }
    prompt = (
        "请从候选账号中筛选与目标账号内容风格、题材、受众和互动逻辑最相似,且整体质量更高的账号。"
        "请返回 JSON 数组,每项包含 candidate_account_id、candidate_profile_url、score、"
        "rationale、similar_dimensions、optimization_value。score 范围 0-100。"
        f"\n\n上下文:\n{json.dumps(prompt_context, ensure_ascii=False, indent=2)}"
    )
    legacy.db.execute(
        """
        INSERT INTO douyin_similarity_searches (
            id, user_id, source_account_id, source_profile_url, keywords_json,
            prompt_text, context_json, created_at
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """,
        (
            search_id,
            owner["id"],
            source_account_row["id"] if source_account_row else None,
            source_payload.get("profile_url", ""),
            _safe_json_dumps(source_payload.get("keywords", [])),
            prompt,
            _safe_json_dumps(prompt_context),
            now()
        )
    )

    model_profile_summary = {
        "id": profile["id"],
        "label": _build_model_label(profile)
    }
    if not candidate_payloads:
        return {
            "search_id": search_id,
            "source_account": source_payload,
            "model_profile": model_profile_summary,
            "raw_model_output": "No candidate accounts available. Sync more Douyin accounts or provide candidate_urls.",
            "candidates": []
        }

    # --- 4. Ask the model; fall back to heuristics on any failure -------
    system_prompt = (
        "你是抖音相似账号发现专家。你要根据内容主题、标签、风格、更新频率、互动表现和商业化潜力,"
        "挑选最值得对标的账号。返回严格 JSON 数组。"
    )
    try:
        output = await legacy.call_model(profile, system_prompt=system_prompt, user_prompt=prompt, temperature=0.2)
        parsed = _try_parse_agent_json(output)
    except Exception as exc:
        # Deliberate best-effort: the error text is surfaced as the raw output
        # and the heuristic ranking below is stored instead.
        output = str(exc)
        parsed = []

    if isinstance(parsed, dict):
        parsed = parsed.get("items") or parsed.get("candidates") or []

    # Only dict entries can be interpreted. A model answering with a list of
    # strings/numbers used to raise AttributeError on ``item.get`` *after* the
    # search row had already been written.
    ranked_items = [item for item in parsed if isinstance(item, dict)] if isinstance(parsed, list) else []
    if not ranked_items:
        ranked_items = [
            {
                "candidate_account_id": payload["id"],
                "candidate_profile_url": payload["profile_url"],
                "score": payload["heuristics"]["heuristic_score"],
                "rationale": "Fallback to heuristic similarity because model output was unavailable or unparsable.",
                "similar_dimensions": [
                    {
                        "topic_overlap": payload["heuristics"]["topic_overlap"],
                        "tag_overlap": payload["heuristics"]["tag_overlap"],
                        "quality_score": payload["heuristics"]["quality_score"]
                    }
                ],
                "optimization_value": "可作为候选对标账号进一步人工确认。"
            }
            for payload in candidate_payloads
        ]

    # --- 5. Persist one candidate record per ranked item ----------------
    candidate_map = {
        payload["id"]: payload for payload in candidate_payloads if payload["id"]
    }
    saved_candidates: list[dict[str, Any]] = []
    for index, item in enumerate(ranked_items, start=1):
        candidate_account_id = _first_non_empty(item.get("candidate_account_id"))
        candidate_profile_url = _first_non_empty(item.get("candidate_profile_url"))
        # Match by account id first, then by profile URL.
        payload = candidate_map.get(candidate_account_id)
        if not payload:
            payload = next(
                (candidate for candidate in candidate_payloads if candidate["profile_url"] == candidate_profile_url),
                None
            )
        candidate_id = make_id("dycand")
        heuristic_score = payload["heuristics"]["heuristic_score"] if payload else 0
        score = _parse_count(item.get("score"))
        rationale = _first_non_empty(item.get("rationale"), item.get("reason"), item.get("summary"))
        dimensions = item.get("similar_dimensions") or item.get("dimensions") or {}
        raw_output = {
            "model_output": item,
            "candidate_payload": payload or {}
        }
        legacy.db.execute(
            """
            INSERT INTO douyin_similarity_candidates (
                id, search_id, candidate_account_id, candidate_profile_url, heuristic_score,
                agent_score, rationale_text, dimensions_json, raw_output_json, rank_index, created_at
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                candidate_id,
                search_id,
                payload["id"] if payload else candidate_account_id or None,
                payload["profile_url"] if payload else candidate_profile_url,
                heuristic_score,
                score,
                rationale,
                _safe_json_dumps(dimensions),
                _safe_json_dumps(raw_output),
                index,
                now()
            )
        )
        saved_candidates.append({
            "id": candidate_id,
            "candidate_account_id": payload["id"] if payload else candidate_account_id,
            "candidate_profile_url": payload["profile_url"] if payload else candidate_profile_url,
            "candidate_nickname": payload["nickname"] if payload else "",
            "heuristic_score": heuristic_score,
            "agent_score": score,
            "rationale_text": rationale,
            "dimensions": dimensions,
            "rank_index": index
        })

    return {
        "search_id": search_id,
        "source_account": source_payload,
        "model_profile": model_profile_summary,
        "raw_model_output": output,
        "candidates": saved_candidates
    }
@app.post("/v2/douyin/accounts/sync")
async def sync_douyin_account(
    request: DouyinAccountSyncRequest,
    account: dict[str, Any] = Depends(legacy.require_approved)
) -> dict[str, Any]:
    """Collect public and creator-centre data, upsert the account, return its workspace.

    The returned workspace carries a ``sync_errors`` list made of the public
    collection errors followed by the creator-centre errors.

    Raises:
        HTTPException: 400 when the request supplies no input at all, or when
            nothing could be extracted from what was supplied.
    """
    has_input = (
        request.profile_url.strip()
        or request.manual_profile_payload
        or request.manual_creator_pages
    )
    if not has_input:
        raise HTTPException(
            status_code=400,
            detail="profile_url、manual_profile_payload 或 manual_creator_pages 至少需要传一个"
        )

    public_data = await _collect_public_profile(request.profile_url, request.manual_profile_payload)
    creator_data = await _collect_creator_center_pages(
        request.creator_center_urls,
        request.session_cookie,
        request.manual_creator_pages
    )

    profile = public_data["profile"]
    extracted_anything = profile.get("nickname") or public_data["videos"] or creator_data["pages"]
    if not extracted_anything:
        raise HTTPException(status_code=400, detail="No Douyin profile or creator-center data could be extracted")

    saved_row = _upsert_account(account, profile, request, public_data, creator_data)
    workspace = _build_workspace_payload(saved_row)
    workspace["sync_errors"] = public_data["errors"] + creator_data["errors"]
    return workspace


@app.get("/v2/douyin/accounts/{account_id}")
def get_douyin_account(
    account_id: str,
    account: dict[str, Any] = Depends(legacy.require_approved)
) -> dict[str, Any]:
    """Return the workspace payload of one account owned by the caller."""
    owned_row = _require_owned_account(account_id, account["id"])
    workspace = _build_workspace_payload(owned_row)
    return workspace


@app.get("/v2/douyin/accounts/{account_id}/snapshots")
def list_douyin_account_snapshots(
    account_id: str,
    account: dict[str, Any] = Depends(legacy.require_approved)
) -> list[dict[str, Any]]:
    """List the stored snapshots of one account owned by the caller."""
    owned_row = _require_owned_account(account_id, account["id"])
    snapshots = _list_snapshots(owned_row["id"])
    return snapshots
@app.get("/v2/douyin/accounts/{account_id}/snapshots/{snapshot_id}")
def get_douyin_account_snapshot(
    account_id: str,
    snapshot_id: str,
    account: dict[str, Any] = Depends(legacy.require_approved)
) -> dict[str, Any]:
    """Return one snapshot of an account owned by the caller."""
    owned_row = _require_owned_account(account_id, account["id"])
    detail = _get_snapshot_detail(snapshot_id, owned_row["id"])
    return detail


@app.get("/v2/douyin/accounts/{account_id}/creator-fields")
def get_douyin_creator_fields(
    account_id: str,
    account: dict[str, Any] = Depends(legacy.require_approved)
) -> dict[str, Any]:
    """Return the detail of the most recent creator-centre snapshot.

    Raises:
        HTTPException: 404 when the account has no ``creator_center`` snapshot.
    """
    owned_row = _require_owned_account(account_id, account["id"])
    newest = legacy.db.fetch_one(
        """
        SELECT id
        FROM douyin_account_snapshots
        WHERE account_id = ? AND snapshot_type = 'creator_center'
        ORDER BY collected_at DESC
        LIMIT 1
        """,
        (owned_row["id"],)
    )
    if newest:
        return _get_snapshot_detail(newest["id"], owned_row["id"])
    raise HTTPException(status_code=404, detail="No creator-center snapshot found")


@app.get("/v2/douyin/accounts/{account_id}/workspace")
def get_douyin_account_workspace(
    account_id: str,
    account: dict[str, Any] = Depends(legacy.require_approved)
) -> dict[str, Any]:
    """Return the workspace payload of one account owned by the caller."""
    owned_row = _require_owned_account(account_id, account["id"])
    workspace = _build_workspace_payload(owned_row)
    return workspace


@app.get("/v2/douyin/accounts/{account_id}/analysis-reports")
def list_douyin_analysis_reports(
    account_id: str,
    account: dict[str, Any] = Depends(legacy.require_approved)
) -> list[dict[str, Any]]:
    """Return the ``recent_reports`` section of the account's workspace payload."""
    owned_row = _require_owned_account(account_id, account["id"])
    workspace = _build_workspace_payload(owned_row)
    return workspace["recent_reports"]
@app.post("/v2/douyin/similar-searches")
async def create_douyin_similarity_search(
    request: DouyinSimilarSearchRequest,
    account: dict[str, Any] = Depends(legacy.require_approved)
) -> dict[str, Any]:
    """Run a similarity search for the caller and return its result."""
    return await _run_similarity_search(account, request)


@app.get("/v2/douyin/similar-searches/{search_id}")
def get_douyin_similarity_search(
    search_id: str,
    account: dict[str, Any] = Depends(legacy.require_approved)
) -> dict[str, Any]:
    """Return a stored similarity search with its ranked candidates.

    Raises:
        HTTPException: 404 when no search with this id belongs to the caller.
    """
    search_row = legacy.db.fetch_one(
        "SELECT * FROM douyin_similarity_searches WHERE id = ? AND user_id = ?",
        (search_id, account["id"])
    )
    if not search_row:
        raise HTTPException(status_code=404, detail="Similarity search not found")

    candidates = legacy.db.fetch_all(
        """
        SELECT cand.*, acct.nickname AS candidate_nickname
        FROM douyin_similarity_candidates cand
        LEFT JOIN douyin_accounts acct ON acct.id = cand.candidate_account_id
        WHERE cand.search_id = ?
        ORDER BY cand.rank_index ASC
        """,
        (search_id,)
    )
    return {
        "id": search_row["id"],
        "source_account_id": search_row["source_account_id"],
        "source_profile_url": search_row["source_profile_url"],
        "keywords": _safe_json_loads(search_row["keywords_json"], []),
        "context": _safe_json_loads(search_row["context_json"], {}),
        "created_at": search_row["created_at"],
        "candidates": [
            {
                "id": row["id"],
                "candidate_account_id": row["candidate_account_id"],
                "candidate_profile_url": row["candidate_profile_url"],
                # The LEFT JOIN yields NULL when the candidate has no account
                # row, so the key exists with value None and a ``.get``
                # default never applies. Normalise to "" to match what the
                # POST endpoint returns for the same candidate.
                "candidate_nickname": row.get("candidate_nickname") or "",
                "heuristic_score": row["heuristic_score"],
                "agent_score": row["agent_score"],
                "rationale_text": row["rationale_text"],
                "dimensions": _safe_json_loads(row["dimensions_json"], {}),
                "rank_index": row["rank_index"]
            }
            for row in candidates
        ]
    }


@app.get("/v2/douyin/accounts/{account_id}/benchmark-links")
def list_douyin_benchmark_links(
    account_id: str,
    account: dict[str, Any] = Depends(legacy.require_approved)
) -> list[dict[str, Any]]:
    """List the benchmark links recorded for one account owned by the caller."""
    account_row = _require_owned_account(account_id, account["id"])
    return _list_linked_accounts(account_row)
def _join_url(base_url: str, path: str) -> str:
    """Join *path* onto *base_url* with exactly one slash between them.

    A *path* that is already an absolute ``http://`` / ``https://`` URL is
    returned unchanged.
    """
    if path.startswith(("http://", "https://")):
        return path
    return f"{base_url.rstrip('/')}/{path.lstrip('/')}"


def _unwrap_response(payload: Any) -> dict[str, Any]:
    """Normalise a decoded JSON body into a dict.

    * non-dict bodies become ``{"value": body}``;
    * ``{"success": True, "data": ...}`` envelopes are unwrapped to ``data``
      (wrapped as ``{"value": data}`` when ``data`` is not a dict);
    * any other dict is returned as-is.
    """
    if not isinstance(payload, dict):
        return {"value": payload}
    if payload.get("success") is True and "data" in payload:
        data = payload["data"]
        return data if isinstance(data, dict) else {"value": data}
    return payload


def _format_workflow_path(template: str, payload: dict[str, Any]) -> str:
    """Fill ``{field}`` placeholders in *template* from *payload*, best effort.

    The template is returned untouched whenever it cannot be formatted.
    ``str.format`` raises more than ``KeyError``: positional fields such as
    ``{}`` / ``{0}`` raise ``IndexError``, a stray brace raises ``ValueError``,
    and attribute/index lookups on a value raise ``AttributeError`` /
    ``TypeError``. None of these should abort a webhook trigger.
    """
    try:
        return template.format(**payload)
    except (LookupError, ValueError, AttributeError, TypeError):
        return template


async def _request_json(
    method: str,
    url: str,
    *,
    timeout: float,
    headers: dict[str, str] | None = None,
    json_body: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Send one HTTP request and return the unwrapped JSON body.

    Raises whatever ``response.raise_for_status()`` raises on an error status.
    """
    async with httpx.AsyncClient(timeout=timeout) as client:
        response = await client.request(method, url, json=json_body, headers=headers)
        response.raise_for_status()
        return _unwrap_response(response.json())


class N8NClient:
    """Triggers n8n webhooks identified by a workflow key."""

    def __init__(
        self,
        *,
        base_url: str,
        workflow_paths: dict[str, str],
        shared_secret: str = "",
        timeout: float = 60.0,
    ) -> None:
        self.base_url = base_url.rstrip("/")
        self.workflow_paths = workflow_paths
        self.shared_secret = shared_secret.strip()
        self.timeout = timeout

    @property
    def enabled(self) -> bool:
        """True when a base URL is configured."""
        return bool(self.base_url)

    async def trigger(self, workflow_key: str, payload: dict[str, Any]) -> dict[str, Any]:
        """POST *payload* to the webhook configured for *workflow_key*.

        Returns ``{"triggered": True}`` for an empty response body, otherwise
        the unwrapped JSON body.

        Raises:
            ValueError: when no path is configured for *workflow_key*.
        """
        workflow_path = self.workflow_paths.get(workflow_key, "").strip()
        if not workflow_path:
            raise ValueError(f"workflow path not configured for {workflow_key}")
        workflow_path = _format_workflow_path(workflow_path, payload)

        headers: dict[str, str] = {}
        if self.shared_secret:
            headers["X-Orchestrator-Secret"] = self.shared_secret
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            response = await client.post(
                _join_url(self.base_url, workflow_path),
                json=payload,
                headers=headers,
            )
            response.raise_for_status()
            if not response.content:
                return {"triggered": True}
            return _unwrap_response(response.json())


class CutVideoClient:
    """HTTP client for the CutVideo job, upload, task and run endpoints."""

    def __init__(
        self,
        *,
        base_url: str,
        api_key: str = "",
        timeout: float = 120.0,
        upload_timeout: float = 1800.0,
    ) -> None:
        self.base_url = base_url.rstrip("/")
        self.api_key = api_key.strip()
        self.timeout = timeout
        self.upload_timeout = upload_timeout

    @property
    def enabled(self) -> bool:
        """True when a base URL is configured."""
        return bool(self.base_url)

    def _headers(self) -> dict[str, str]:
        """Return the bearer-auth header, or no headers without an API key."""
        headers: dict[str, str] = {}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        return headers

    async def submit_job(self, payload: dict[str, Any]) -> dict[str, Any]:
        """POST *payload* to ``/api/jobs``."""
        return await _request_json(
            "POST",
            _join_url(self.base_url, "/api/jobs"),
            timeout=self.timeout,
            headers=self._headers(),
            json_body=payload,
        )

    async def upload_source_file(self, source_path: Path, *, folder_name: str = "") -> dict[str, Any]:
        """Upload *source_path* as multipart form data to ``/api/uploads``.

        ``folder_name`` is sent as a form field only when non-empty. The
        longer ``upload_timeout`` applies to this call.
        """
        content_type = mimetypes.guess_type(source_path.name)[0] or "application/octet-stream"
        headers = self._headers()
        data = {"folder_name": folder_name} if folder_name else {}
        async with httpx.AsyncClient(timeout=self.upload_timeout) as client:
            with source_path.open("rb") as handle:
                response = await client.post(
                    _join_url(self.base_url, "/api/uploads"),
                    data=data,
                    files={"files": (source_path.name, handle, content_type)},
                    headers=headers,
                )
            response.raise_for_status()
            return _unwrap_response(response.json())

    async def get_task(self, task_id: str) -> dict[str, Any]:
        """GET ``/api/tasks/{task_id}``."""
        return await _request_json(
            "GET",
            _join_url(self.base_url, f"/api/tasks/{task_id}"),
            timeout=self.timeout,
            headers=self._headers(),
        )

    async def get_run(self, run_id: str) -> dict[str, Any]:
        """GET ``/api/runs/{run_id}``."""
        return await _request_json(
            "GET",
            _join_url(self.base_url, f"/api/runs/{run_id}"),
            timeout=self.timeout,
            headers=self._headers(),
        )


class AsrHttpClient:
    """HTTP client that uploads an audio file to a transcription endpoint."""

    def __init__(
        self,
        *,
        base_url: str,
        transcribe_path: str = "/transcribe",
        field_name: str = "wav",
        timeout: float = 120.0,
    ) -> None:
        self.base_url = base_url.rstrip("/")
        self.transcribe_path = transcribe_path
        # A blank field name falls back to "wav".
        self.field_name = field_name.strip() or "wav"
        self.timeout = timeout

    @property
    def enabled(self) -> bool:
        """True when a base URL is configured."""
        return bool(self.base_url)

    async def transcribe_audio(self, audio_path: Path) -> dict[str, Any]:
        """Upload *audio_path* under ``field_name`` and return the unwrapped reply."""
        content_type = mimetypes.guess_type(audio_path.name)[0] or "application/octet-stream"
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            with audio_path.open("rb") as handle:
                response = await client.post(
                    _join_url(self.base_url, self.transcribe_path),
                    files={self.field_name: (audio_path.name, handle, content_type)},
                )
            response.raise_for_status()
            return _unwrap_response(response.json())


class HuobaoDramaClient:
    """HTTP client for the Huobao drama, image and video ``/api/v1`` endpoints."""

    def __init__(self, *, base_url: str, timeout: float = 180.0) -> None:
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    @property
    def enabled(self) -> bool:
        """True when a base URL is configured."""
        return bool(self.base_url)

    async def _call(self, method: str, path: str, payload: dict[str, Any] | None = None) -> dict[str, Any]:
        """Send one request relative to ``base_url`` using the client timeout."""
        return await _request_json(
            method,
            _join_url(self.base_url, path),
            timeout=self.timeout,
            json_body=payload,
        )

    async def create_drama(self, payload: dict[str, Any]) -> dict[str, Any]:
        """POST *payload* to ``/api/v1/dramas``."""
        return await self._call("POST", "/api/v1/dramas", payload)

    async def generate_image(self, payload: dict[str, Any]) -> dict[str, Any]:
        """POST *payload* to ``/api/v1/images``."""
        return await self._call("POST", "/api/v1/images", payload)

    async def get_image(self, image_id: str) -> dict[str, Any]:
        """GET ``/api/v1/images/{image_id}``."""
        return await self._call("GET", f"/api/v1/images/{image_id}")

    async def generate_video(self, payload: dict[str, Any]) -> dict[str, Any]:
        """POST *payload* to ``/api/v1/videos``."""
        return await self._call("POST", "/api/v1/videos", payload)

    async def get_video(self, video_id: str) -> dict[str, Any]:
        """GET ``/api/v1/videos/{video_id}``."""
        return await self._call("GET", f"/api/v1/videos/{video_id}")
BASE_DIR = Path(__file__).resolve().parent
PYCACHE_DIR = BASE_DIR / "__pycache__"
LEGACY_PYC_DIR = BASE_DIR / "_legacy_pyc"
SUPPORTED_PYTHON = (3, 11)

_LEGACY_MODULE: Any | None = None


def _ensure_supported_runtime() -> None:
    """Raise ``RuntimeError`` unless the interpreter matches ``SUPPORTED_PYTHON``."""
    if sys.version_info[:2] != SUPPORTED_PYTHON:
        version = ".".join(map(str, sys.version_info[:3]))
        required = ".".join(map(str, SUPPORTED_PYTHON))
        raise RuntimeError(
            f"Legacy collector bytecode requires Python {required}. Current runtime: {version}."
        )


def _ensure_package() -> None:
    """Register a synthetic ``app`` package in ``sys.modules`` if none exists."""
    package = sys.modules.get("app")
    if package is None:
        package = types.ModuleType("app")
        package.__path__ = [str(BASE_DIR)]
        sys.modules["app"] = package


def _load_sourceless_module(module_name: str, pyc_path: Path) -> Any:
    """Load the bytecode file *pyc_path* as module *module_name*.

    The module is registered in ``sys.modules`` before it executes. If
    execution fails the registration is removed again, so a later retry (or a
    ``name not in sys.modules`` check) does not pick up a half-initialised
    module.

    Raises:
        RuntimeError: when no import spec can be created for *module_name*.
        Exception: whatever the module body raises while executing.
    """
    loader = importlib.machinery.SourcelessFileLoader(module_name, str(pyc_path))
    spec = importlib.util.spec_from_loader(module_name, loader)
    if spec is None:
        raise RuntimeError(f"Unable to create spec for {module_name}")
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    try:
        loader.exec_module(module)
    except BaseException:
        # Roll back the registration only if it is still ours.
        if sys.modules.get(module_name) is module:
            del sys.modules[module_name]
        raise
    return module


def _locate_pyc(name: str) -> Path:
    """Return the bytecode path for *name*, preferring ``LEGACY_PYC_DIR``."""
    major, minor = SUPPORTED_PYTHON
    filename = f"{name}.cpython-{major}{minor}.pyc"
    legacy_candidate = LEGACY_PYC_DIR / filename
    if legacy_candidate.exists():
        return legacy_candidate
    return PYCACHE_DIR / filename


def load_legacy_main() -> Any:
    """Load (once) and return the bytecode-only legacy ``main`` module.

    Its sibling modules ``database``, ``fastgpt`` and ``openai_compat`` are
    loaded first under the ``app`` package unless already imported. The result
    is cached in ``_LEGACY_MODULE``.

    Raises:
        RuntimeError: when the interpreter version is not supported.
    """
    global _LEGACY_MODULE
    if _LEGACY_MODULE is not None:
        return _LEGACY_MODULE

    _ensure_supported_runtime()
    _ensure_package()

    for name in ("database", "fastgpt", "openai_compat"):
        full_name = f"app.{name}"
        if full_name not in sys.modules:
            _load_sourceless_module(full_name, _locate_pyc(name))

    legacy_name = "app.main_legacy"
    if legacy_name in sys.modules:
        _LEGACY_MODULE = sys.modules[legacy_name]
        return _LEGACY_MODULE

    _LEGACY_MODULE = _load_sourceless_module(legacy_name, _locate_pyc("main"))
    _LEGACY_MODULE.__package__ = "app"
    return _LEGACY_MODULE
class OpenAICompatClient:
    """Minimal client for an OpenAI-compatible ``/chat/completions`` endpoint."""

    def __init__(self, timeout: float = 180.0) -> None:
        self.timeout = timeout

    async def chat_completion(
        self,
        *,
        base_url: str,
        api_key: str,
        model: str,
        system_prompt: str,
        user_prompt: str,
        temperature: float = 0.7,
    ) -> str:
        """Send one system + user exchange and return the reply text.

        Returns an empty string when the response carries no choices. A
        list-valued ``content`` is flattened by joining the ``text`` of its
        dict parts with newlines. The result is always stripped.
        """
        endpoint = f"{base_url.rstrip('/')}/chat/completions"

        request_headers = {"Content-Type": "application/json"}
        token = api_key.strip()
        if token:
            # The Authorization header is only sent when a key is configured.
            request_headers["Authorization"] = f"Bearer {token}"

        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        body: dict[str, Any] = {
            "model": model,
            "temperature": temperature,
            "messages": conversation,
        }

        async with httpx.AsyncClient(timeout=self.timeout) as http:
            reply = await http.post(endpoint, headers=request_headers, json=body)
            reply.raise_for_status()
            decoded = reply.json()

        choices = decoded.get("choices") or []
        if not choices:
            return ""

        first_message = choices[0].get("message") or {}
        content = first_message.get("content") or ""
        if not isinstance(content, list):
            return str(content).strip()

        text_parts = [str(part.get("text", "")) for part in content if isinstance(part, dict)]
        return "\n".join(text_parts).strip()
N8N_CONTENT_SOURCE_SYNC_WEBHOOK_PATH="${N8N_CONTENT_SOURCE_SYNC_WEBHOOK_PATH:-/webhook/storyforge-content-source-sync}" +export ORCHESTRATOR_SHARED_SECRET="${ORCHESTRATOR_SHARED_SECRET:-storyforge-local-secret}" +export CUTVIDEO_BASE_URL="${CUTVIDEO_BASE_URL:-http://192.168.31.18:7860}" +export CUTVIDEO_API_KEY="${CUTVIDEO_API_KEY:-}" +export CUTVIDEO_BASE_CONFIG="${CUTVIDEO_BASE_CONFIG:-example.job.yaml}" +export CUTVIDEO_POLL_INTERVAL_SEC="${CUTVIDEO_POLL_INTERVAL_SEC:-10}" +export CUTVIDEO_MAX_WAIT_SEC="${CUTVIDEO_MAX_WAIT_SEC:-1800}" +export CUTVIDEO_UPLOAD_TIMEOUT_SEC="${CUTVIDEO_UPLOAD_TIMEOUT_SEC:-1800}" +export HUOBAO_BASE_URL="${HUOBAO_BASE_URL:-http://host.docker.internal:5678}" +export HUOBAO_POLL_INTERVAL_SEC="${HUOBAO_POLL_INTERVAL_SEC:-10}" +export HUOBAO_MAX_WAIT_SEC="${HUOBAO_MAX_WAIT_SEC:-900}" + +cd "$ROOT_DIR" +exec ./.venv311/bin/python -m uvicorn app.main:app --host "$HOST" --port "$PORT"