Files
momentry_core/scripts/regenerate_parent_5w1h.py
Warren e75c4d6f07 cleanup: remove dead code and duplicate docs
- Remove session-ses_2f27.md (161KB raw session log)
- Remove 49 ROOT_* duplicate files across REFERENCE/
- Remove 14 duplicate files between REFERENCE/ root and history/
- Remove asr_legacy.rs (dead code, replaced by asr.rs)
- Remove src/core/worker/ (duplicate JobWorker)
- Remove src/core/layers/ (empty directory)
- Remove 4 .bak files in src/
- Remove 7 dead private methods in worker/processor.rs
- Remove backup directory from git tracking
2026-05-04 01:31:21 +08:00

198 lines
6.2 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
Regenerate parent chunk summaries using 5W1H multi-dimensional structure via gemma4.
5W1H Structure:
- Who: Main characters/people involved
- What: Key actions/events
- When: Temporal context (sequence in story)
- Where: Location/setting
- Why: Motivation/conflict driving the scene
- How: Emotional tone/manner of events
"""
import json
import requests
import psycopg2
import psycopg2.extras
# Keyword arguments passed to psycopg2.connect() by the database helpers in
# this script. NOTE(review): no password or port is given here — presumably the
# local server accepts this user without one; confirm for your environment.
DB_CONFIG = {"host": "localhost", "user": "accusys", "dbname": "momentry"}
# Value matched against parent_chunks.uuid to select the rows to regenerate.
UUID = "384b0ff44aaaa1f1"
# OpenAI-compatible chat-completions endpoint that call_gemma4() posts to.
LLAMA_URL = "http://127.0.0.1:8081/v1/chat/completions"
def get_parent_with_children():
    """Fetch every parent chunk of the configured video with its child texts.

    Runs a single query against ``parent_chunks`` filtered by the module-level
    ``UUID`` and LEFT JOINed to ``chunks``, so parents without any child chunk
    are still returned.

    Returns:
        list: One dict-like row (``RealDictCursor``) per parent chunk, ordered
        by ``scene_order``. Each row carries the selected ``parent_chunks``
        columns, ``old_summary`` (the current ``summary_text``) and
        ``child_texts`` (the children's ``text_content`` ordered by their
        ``start_time``). Entries of ``child_texts`` may be ``None``; callers
        are expected to filter them.

    Raises:
        psycopg2.Error: If connecting or querying fails. The connection is
            closed before the exception propagates.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # The cursor context manager closes the cursor even when execute() or
        # fetchall() raises; the outer finally does the same for the
        # connection ("with conn" would only end the transaction, not close).
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(
                """
                SELECT pc.id, pc.scene_order, pc.start_time, pc.end_time,
                       pc.start_frame, pc.end_frame, pc.fps,
                       pc.summary_text as old_summary,
                       pc.metadata,
                       ARRAY_AGG(c.text_content ORDER BY c.start_time) as child_texts
                FROM parent_chunks pc
                LEFT JOIN chunks c ON c.parent_chunk_id = pc.id::varchar
                WHERE pc.uuid = %s
                GROUP BY pc.id, pc.scene_order, pc.start_time, pc.end_time,
                         pc.start_frame, pc.end_frame, pc.fps,
                         pc.summary_text, pc.metadata
                ORDER BY pc.scene_order
                """,
                (UUID,),
            )
            return cur.fetchall()
    finally:
        conn.close()
def call_gemma4(prompt, max_tokens=1500):
    """Send a single-turn prompt to the llama-server chat endpoint.

    Posts ``prompt`` as one user message to ``LLAMA_URL`` (OpenAI-compatible
    chat-completions API) with fixed sampling settings and a 180 second
    timeout.

    Args:
        prompt: Text sent as the content of the user message.
        max_tokens: Upper bound on generated tokens forwarded to the server.

    Returns:
        str: The stripped content of the first choice, or ``""`` when the
        request fails, the server answers with a non-200 status, or the
        response body does not have the expected shape. This function never
        raises for those failures; it prints a warning instead.
    """
    payload = {
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.3,
        "min_p": 0.1,
    }

    try:
        resp = requests.post(LLAMA_URL, json=payload, timeout=180)
    except requests.RequestException as e:
        print(f" ⚠️ llama-server error: {e}")
        return ""

    # Report HTTP failures instead of silently returning an empty string, so
    # an overloaded or misconfigured server is distinguishable from an empty
    # completion.
    if resp.status_code != 200:
        print(f" ⚠️ llama-server error: HTTP {resp.status_code}")
        return ""

    try:
        # "or [{}]" also covers an explicitly empty choices list, and
        # "or ''" covers a null content field.
        choices = resp.json().get("choices") or [{}]
        content = choices[0].get("message", {}).get("content", "") or ""
        return content.strip()
    except (ValueError, AttributeError, KeyError, IndexError, TypeError) as e:
        # ValueError: body is not JSON; the others: unexpected JSON shape.
        print(f" ⚠️ llama-server error: {e}")
        return ""
def generate_5w1h_summary(parent, scene_num, total_scenes=17):
    """Ask the model for a 5W1H analysis of one parent chunk.

    Args:
        parent: Mapping with ``child_texts`` (list of dialogue strings, may be
            ``None`` or contain empty/``None`` entries), ``start_time`` and
            ``end_time``.
        scene_num: Scene number shown to the model in the prompt.
        total_scenes: Scene total shown to the model in the prompt. Defaults
            to 17, the value that was previously hard-coded.

    Returns:
        dict | None: The JSON object parsed from the model response, or
        ``None`` when the parent has no usable dialogue, the model returned
        nothing, or no parseable JSON object was found in the response.
    """
    texts = [t for t in (parent["child_texts"] or []) if t]
    if not texts:
        return None

    # Long scenes are sampled (first 3 + last 3 lines) and the joined text is
    # capped at 1500 characters to keep the prompt short and generation fast.
    sample_texts = texts[:3] + ["..."] + texts[-3:] if len(texts) > 6 else texts
    combined = "\n".join(sample_texts)[:1500]
    duration = parent["end_time"] - parent["start_time"]

    prompt = f"""You are a film scene analyst. Analyze this scene and provide 5W1H analysis.
Scene {scene_num}/{total_scenes} | {duration:.0f}s | {len(texts)} dialogue lines
Key dialogue:
{combined}
Respond with ONLY this JSON:
{{"summary_5lines":"...","who":"...","what":"...","when":"...","where":"...","why":"...","how":"...","characters":[],"tone":[],"key_events":[]}}
IMPORTANT: "summary_5lines" must be EXACTLY 5 lines describing the scene. Each line should be a complete sentence separated by \\n."""

    response = call_gemma4(prompt, max_tokens=2000)
    if not response:
        return None

    # Simple JSON extraction: take everything from the first "{" to the last
    # "}" so surrounding prose or code fences from the model are ignored.
    start = response.find("{")
    end = response.rfind("}") + 1
    if start < 0 or end <= start:
        print(" ⚠️ no JSON object found in model response")
        return None

    try:
        return json.loads(response[start:end])
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        print(f" ⚠️ could not parse model JSON: {e}")
        return None
def update_parent_chunk(parent, analysis):
    """Persist a 5W1H analysis onto one parent chunk row.

    Writes ``analysis["summary_5lines"]`` to ``summary_text`` and stores the
    full 5W1H structure under ``metadata["structured_summary"]`` (alongside
    ``auto_generated_by`` and ``chunk_count``) for the row whose ``id`` equals
    ``parent["id"]``.

    Args:
        parent: Mapping with ``id``, ``metadata`` (dict or ``None``) and
            ``child_texts`` (list or ``None``). It is not modified.
        analysis: Dict produced by the model; missing keys fall back to empty
            strings / lists.

    Returns:
        bool: ``False`` when ``analysis`` is empty or ``None`` (nothing is
        written and no connection is opened), ``True`` after a committed
        update.

    Raises:
        psycopg2.Error: If connecting or updating fails. The connection is
            closed before the exception propagates.
    """
    if not analysis:
        return False

    # Create structured summary text (5 lines)
    structured_text = str(analysis.get("summary_5lines", ""))

    # Work on a shallow copy so the caller's row is not mutated as a side
    # effect of building the new metadata.
    metadata = dict(parent["metadata"]) if parent["metadata"] else {}
    metadata["auto_generated_by"] = "gemma4"
    metadata["chunk_count"] = len(parent["child_texts"] or [])
    metadata["structured_summary"] = {
        "summary_5lines": analysis.get("summary_5lines", ""),
        "who": analysis.get("who", ""),
        "what": analysis.get("what", ""),
        "when": analysis.get("when", ""),
        "where": analysis.get("where", ""),
        "why": analysis.get("why", ""),
        "how": analysis.get("how", ""),
        "characters": analysis.get("characters", []),
        "tone": analysis.get("tone", []),
        "key_events": analysis.get("key_events", []),
    }

    conn = psycopg2.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cur:
            cur.execute(
                """
                UPDATE parent_chunks
                SET summary_text = %s,
                    metadata = %s::jsonb
                WHERE id = %s
                """,
                (
                    structured_text,
                    json.dumps(metadata, ensure_ascii=False),
                    parent["id"],
                ),
            )
        conn.commit()
    finally:
        # Closing without a commit discards the pending transaction, so a
        # failed UPDATE leaves the row unchanged and no connection is leaked.
        conn.close()
    return True
def _preview(value, limit):
    """Return ``value`` as text cut to ``limit`` characters ("N/A" if None)."""
    return "N/A" if value is None else str(value)[:limit]


def main():
    """Regenerate and store 5W1H summaries for every parent chunk of ``UUID``.

    Loads the parent chunks, asks the model for an analysis of each scene,
    prints a short preview, writes successful analyses back to the database
    and finally reports how many rows were updated.
    """
    print(f"🎬 Regenerating 5W1H summaries for {UUID}")
    print(f" Using llama.cpp server at {LLAMA_URL}")
    print("=" * 70)

    parents = get_parent_with_children()
    print(f"📥 Found {len(parents)} parent chunks")

    success_count = 0
    for parent in parents:
        duration = parent["end_time"] - parent["start_time"]
        # NOTE(review): this counts every array entry, including None ones
        # that the summary generation later filters out — confirm intended.
        text_count = len(parent["child_texts"] or [])
        print(
            f"\n🎬 Scene {parent['scene_order']}: {parent['start_time']:.0f}s-{parent['end_time']:.0f}s ({duration:.0f}s, {text_count} chunks)"
        )
        if parent["old_summary"]:
            print(f" Old: {parent['old_summary'][:80]}...")

        analysis = generate_5w1h_summary(parent, parent["scene_order"])
        if not analysis:
            print(" ❌ Failed to generate analysis")
            continue

        # The model may return null or non-string values for any field;
        # _preview() keeps the progress output from raising on those.
        summary = _preview(analysis.get("summary_5lines"), 100)
        who = _preview(analysis.get("who"), 60)
        where = _preview(analysis.get("where"), 60)
        why = _preview(analysis.get("why"), 60)
        print(f" ✅ Summary: {summary}...")
        print(f" 👤 Who: {who}")
        print(f" 📍 Where: {where}")
        print(f" 💡 Why: {why}")

        if update_parent_chunk(parent, analysis):
            success_count += 1

    print(f"\n{'=' * 70}")
    print(
        f"✅ Updated {success_count}/{len(parents)} parent chunks with 5W1H summaries"
    )


if __name__ == "__main__":
    main()