From be94836e3c690e3509c1819c4a19d4ff76bc4431 Mon Sep 17 00:00:00 2001
From: kris
Date: Sat, 21 Mar 2026 02:26:42 +0800
Subject: [PATCH] fix: collapse duplicate douyin analysis history

Fold analysis reports with identical content, and then reports sharing a
focus text, into the newest copy when listing an account's history, and
expose duplicate_count / duplicate_report_ids on the kept payload. When
an analysis run reproduces a stored report, delete the new report and
return the stored one. The control panel shows how many reports were
folded.

---
Reviewer note: hunks were edited by hand after generation, so the blob
ids on the "index" lines below no longer match the resulting files.

 collector-service/app/douyin_features.py      | 214 +++++++++++++++---
 .../douyin-browser-capture/control_panel.mjs  |   1 +
 2 files changed, 181 insertions(+), 34 deletions(-)

diff --git a/collector-service/app/douyin_features.py b/collector-service/app/douyin_features.py
index 1900cfa..69e07f2 100644
--- a/collector-service/app/douyin_features.py
+++ b/collector-service/app/douyin_features.py
@@ -1598,6 +1598,166 @@ def register_douyin_routes(app: Any, legacy: Any) -> None:
             })
         return payloads
 
+    def _normalize_report_text(value: Any) -> str:
+        """Return ``value`` as stripped text with whitespace runs collapsed to one space."""
+        text = str(value or "").strip()
+        if not text:
+            return ""
+        return re.sub(r"\s+", " ", text)
+
+    def _build_report_payload(report: dict[str, Any]) -> dict[str, Any]:
+        """Build the API payload for one report row, including its suggestions."""
+        suggestions = legacy.db.fetch_all(
+            "SELECT * FROM douyin_analysis_suggestions WHERE report_id = ? ORDER BY created_at ASC",
+            (report["id"],)
+        )
+        return {
+            "id": report["id"],
+            "focus_text": report["focus_text"],
+            "model_profile_ids": _safe_json_loads(report["model_profile_ids_json"], []),
+            "linked_account_ids": _safe_json_loads(report["linked_account_ids_json"], []),
+            "created_at": report["created_at"],
+            # A report starts as its own only copy; _list_report_payloads
+            # raises these when it folds other reports into this one.
+            "duplicate_count": 1,
+            "duplicate_report_ids": [],
+            "suggestions": [
+                {
+                    "id": suggestion["id"],
+                    "model_profile_id": suggestion["model_profile_id"],
+                    "model_label": suggestion["model_label"],
+                    "status": suggestion["status"],
+                    "suggestion_text": suggestion["suggestion_text"],
+                    "parsed_json": _safe_json_loads(suggestion["parsed_json"], {})
+                }
+                for suggestion in suggestions
+            ]
+        }
+
+    def _report_signature(report_payload: dict[str, Any]) -> str:
+        """Return a text key that is equal for reports with the same content.
+
+        The key joins the normalized focus text with one entry per
+        suggestion: model profile id, status, and either the key-sorted
+        parsed JSON or, when that is empty, the normalized suggestion text.
+        """
+        parts = [_normalize_report_text(report_payload.get("focus_text"))]
+        for suggestion in report_payload.get("suggestions", []):
+            parsed = suggestion.get("parsed_json") or {}
+            if isinstance(parsed, dict) and parsed:
+                normalized_content = json.dumps(parsed, ensure_ascii=False, sort_keys=True)
+            else:
+                normalized_content = _normalize_report_text(suggestion.get("suggestion_text"))
+            parts.append(
+                "|".join(
+                    [
+                        # str.join() raises TypeError on None or non-str
+                        # values, so coerce the id and status first.
+                        str(suggestion.get("model_profile_id") or ""),
+                        str(suggestion.get("status") or ""),
+                        normalized_content
+                    ]
+                )
+            )
+        return "\n".join(parts)
+
+    def _list_report_payloads(account_id: str, limit: int = 5, dedupe: bool = True) -> list[dict[str, Any]]:
+        """Return up to ``limit`` report payloads for an account, newest first.
+
+        With ``dedupe`` set, reports with an identical signature are folded
+        into the newest copy, and the survivors are then folded again so
+        that only the newest report per focus text remains. The kept
+        payload's ``duplicate_count`` and ``duplicate_report_ids`` describe
+        every report folded into it.
+        """
+        rows = legacy.db.fetch_all(
+            """
+            SELECT *
+            FROM douyin_analysis_reports
+            WHERE account_id = ?
+            ORDER BY created_at DESC
+            LIMIT ?
+            """,
+            # Fetch more rows than ``limit`` because folding below removes some.
+            (account_id, max(limit * 4, 20))
+        )
+        payloads = [_build_report_payload(report) for report in rows]
+        if not dedupe:
+            return payloads[:limit]
+
+        def _fold(kept: dict[str, Any], dropped: dict[str, Any]) -> None:
+            # Carry over what ``dropped`` already absorbed; adding a flat 1
+            # would lose the reports folded into it by an earlier pass.
+            kept["duplicate_count"] = (
+                int(kept.get("duplicate_count") or 1)
+                + int(dropped.get("duplicate_count") or 1)
+            )
+            kept.setdefault("duplicate_report_ids", []).extend(
+                [dropped["id"], *dropped.get("duplicate_report_ids", [])]
+            )
+
+        unique_payloads: list[dict[str, Any]] = []
+        seen: dict[str, dict[str, Any]] = {}
+        for payload in payloads:
+            signature = _report_signature(payload)
+            if signature in seen:
+                _fold(seen[signature], payload)
+                continue
+            seen[signature] = payload
+            unique_payloads.append(payload)
+        focus_filtered: list[dict[str, Any]] = []
+        focus_seen: dict[str, dict[str, Any]] = {}
+        for payload in unique_payloads:
+            # Normalize before applying the fallback so that a
+            # whitespace-only focus groups with the empty/default focus.
+            focus_key = _normalize_report_text(payload.get("focus_text")) or "__default__"
+            if focus_key in focus_seen:
+                _fold(focus_seen[focus_key], payload)
+                continue
+            focus_seen[focus_key] = payload
+            focus_filtered.append(payload)
+        return focus_filtered[:limit]
+
+    def _delete_report(report_id: str) -> None:
+        """Delete a report's suggestion rows, then the report row itself."""
+        legacy.db.execute("DELETE FROM douyin_analysis_suggestions WHERE report_id = ?", (report_id,))
+        legacy.db.execute("DELETE FROM douyin_analysis_reports WHERE id = ?", (report_id,))
+
+    def _find_duplicate_report_payload(
+        account_id: str,
+        focus_text: str,
+        suggestion_payloads: list[dict[str, Any]],
+        exclude_report_id: str = ""
+    ) -> dict[str, Any] | None:
+        """Return the newest stored report matching the given content, if any.
+
+        Compares the signature of ``focus_text`` plus ``suggestion_payloads``
+        with up to 10 of the account's latest reports that share the focus
+        text, skipping ``exclude_report_id``.
+        """
+        candidate_rows = legacy.db.fetch_all(
+            """
+            SELECT *
+            FROM douyin_analysis_reports
+            WHERE account_id = ? AND COALESCE(focus_text, '') = ? AND id != ?
+            ORDER BY created_at DESC
+            LIMIT 10
+            """,
+            # ``= NULL`` never matches in SQL, so a missing focus is compared
+            # as the empty string on both sides.
+            (account_id, focus_text or "", exclude_report_id)
+        )
+        probe_payload = {
+            "focus_text": focus_text,
+            "suggestions": suggestion_payloads
+        }
+        probe_signature = _report_signature(probe_payload)
+        for row in candidate_rows:
+            candidate_payload = _build_report_payload(row)
+            if _report_signature(candidate_payload) == probe_signature:
+                return candidate_payload
+        return None
+
     def _build_workspace_payload(account_row: dict[str, Any]) -> dict[str, Any]:
         account_payload = _build_account_payload(account_row)
         video_workspace = _build_video_workspace_payload(account_row)
@@ -1621,40 +1781,7 @@ def register_douyin_routes(app: Any, legacy: Any) -> None:
             """,
             (account_row["id"],)
         )
-        reports = legacy.db.fetch_all(
-            """
-            SELECT *
-            FROM douyin_analysis_reports
-            WHERE account_id = ?
-            ORDER BY created_at DESC
-            LIMIT 5
-            """,
-            (account_row["id"],)
-        )
-        report_payloads = []
-        for report in reports:
-            suggestions = legacy.db.fetch_all(
-                "SELECT * FROM douyin_analysis_suggestions WHERE report_id = ? ORDER BY created_at ASC",
-                (report["id"],)
-            )
-            report_payloads.append({
-                "id": report["id"],
-                "focus_text": report["focus_text"],
-                "model_profile_ids": _safe_json_loads(report["model_profile_ids_json"], []),
-                "linked_account_ids": _safe_json_loads(report["linked_account_ids_json"], []),
-                "created_at": report["created_at"],
-                "suggestions": [
-                    {
-                        "id": suggestion["id"],
-                        "model_profile_id": suggestion["model_profile_id"],
-                        "model_label": suggestion["model_label"],
-                        "status": suggestion["status"],
-                        "suggestion_text": suggestion["suggestion_text"],
-                        "parsed_json": _safe_json_loads(suggestion["parsed_json"], {})
-                    }
-                    for suggestion in suggestions
-                ]
-            })
+        report_payloads = _list_report_payloads(account_row["id"], limit=5, dedupe=True)
         recent_searches = legacy.db.fetch_all(
             """
             SELECT *
@@ -2335,6 +2462,25 @@ def register_douyin_routes(app: Any, legacy: Any) -> None:
             }
 
         suggestions = await asyncio.gather(*[_analyze_with_model(profile) for profile in profiles])
+        duplicate_report = _find_duplicate_report_payload(
+            account_row["id"],
+            request.extra_focus,
+            suggestions,
+            exclude_report_id=report_id
+        )
+        if duplicate_report:
+            # An identical report is already stored: delete ``report_id``
+            # and return the stored one instead.
+            _delete_report(report_id)
+            return {
+                "report_id": duplicate_report["id"],
+                "created_at": duplicate_report["created_at"],
+                "context": analysis_context,
+                "suggestions": duplicate_report["suggestions"],
+                "auto_video_analyses": [],
+                "duplicate_of_report_id": duplicate_report["id"],
+                "duplicate_count": duplicate_report.get("duplicate_count", 1)
+            }
         auto_video_analyses: list[dict[str, Any]] = []
         if request.auto_analyze_top_videos and profiles:
             auto_video_analyses = await _run_top_video_analyses(
diff --git a/scripts/douyin-browser-capture/control_panel.mjs b/scripts/douyin-browser-capture/control_panel.mjs
index 7fb9184..ef91a3d 100644
--- a/scripts/douyin-browser-capture/control_panel.mjs
+++ b/scripts/douyin-browser-capture/control_panel.mjs
@@ -1505,6 +1505,7 @@ function renderPage() {
         '' + escapeHtml(report.focus_text || "默认分析") + '',
         '' + escapeHtml(formatDateTime(report.created_at)) + '',
         '',
+        Number(report.duplicate_count || 1) > 1 ? '已折叠 ' + escapeHtml(String(Number(report.duplicate_count) - 1)) + ' 条同主题历史' : '',
         safeArray(report.suggestions).length ? safeArray(report.suggestions).map(renderAccountSuggestion).join("") : '这份报告还没有 suggestion。',
         ''
       ].join("");