- Remove session-ses_2f27.md (161KB raw session log) - Remove 49 ROOT_* duplicate files across REFERENCE/ - Remove 14 duplicate files between REFERENCE/ root and history/ - Remove asr_legacy.rs (dead code, replaced by asr.rs) - Remove src/core/worker/ (duplicate JobWorker) - Remove src/core/layers/ (empty directory) - Remove 4 .bak files in src/ - Remove 7 dead private methods in worker/processor.rs - Remove backup directory from git tracking
198 lines
6.2 KiB
Python
198 lines
6.2 KiB
Python
#!/opt/homebrew/bin/python3.11
|
|
"""
|
|
Regenerate parent chunk summaries using a 5W1H multi-dimensional structure via gemma4.

5W1H Structure:
- Who: Main characters/people involved
- What: Key actions/events
- When: Temporal context (sequence in story)
- Where: Location/setting
- Why: Motivation/conflict driving the scene
- How: Emotional tone/manner of events
|
|
"""
|
|
|
|
import json
|
|
import requests
|
|
import psycopg2
|
|
import psycopg2.extras
|
|
|
|
# Keyword arguments handed to psycopg2.connect() by the database helpers.
DB_CONFIG = dict(host="localhost", user="accusys", dbname="momentry")

# Value matched against parent_chunks.uuid to select the rows to regenerate.
UUID = "384b0ff44aaaa1f1"

# OpenAI-compatible chat-completions endpoint exposed by llama-server.
LLAMA_URL = "http://127.0.0.1:8081/v1/chat/completions"
|
|
|
|
|
|
def get_parent_with_children():
    """Fetch all parent chunks for ``UUID`` together with their child texts.

    Returns:
        A list of dict-like rows (``RealDictCursor``), ordered by
        ``scene_order``. Each row carries the parent columns selected below,
        the current summary under ``old_summary``, and ``child_texts``: the
        children's ``text_content`` values ordered by child ``start_time``.

    Raises:
        psycopg2.Error: If connecting or querying fails. The connection is
            closed before the exception propagates.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            # LEFT JOIN keeps parents that have no children; for those rows
            # ARRAY_AGG yields an array holding a single NULL, so consumers
            # must filter out empty entries.
            cur.execute(
                """
                SELECT pc.id, pc.scene_order, pc.start_time, pc.end_time,
                       pc.start_frame, pc.end_frame, pc.fps, pc.summary_text as old_summary,
                       pc.metadata,
                       ARRAY_AGG(c.text_content ORDER BY c.start_time) as child_texts
                FROM parent_chunks pc
                LEFT JOIN chunks c ON c.parent_chunk_id = pc.id::varchar
                WHERE pc.uuid = %s
                GROUP BY pc.id, pc.scene_order, pc.start_time, pc.end_time,
                         pc.start_frame, pc.end_frame, pc.fps, pc.summary_text, pc.metadata
                ORDER BY pc.scene_order
                """,
                (UUID,),
            )
            return cur.fetchall()
    finally:
        # Always release the connection, including when the query raises.
        conn.close()
|
|
|
|
|
|
def call_gemma4(prompt, max_tokens=1500):
    """Send a single-turn chat request to llama-server and return the reply text.

    Args:
        prompt: User message content sent as the only chat message.
        max_tokens: Upper bound on generated tokens passed to the server.

    Returns:
        The stripped assistant message content, or ``""`` when the request
        fails, the server answers with a non-200 status, or the response
        carries no usable content. Failures are reported on stdout and never
        raised, so callers only need to check for an empty string.
    """
    payload = {
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.3,
        "min_p": 0.1,
    }
    try:
        resp = requests.post(LLAMA_URL, json=payload, timeout=180)
        if resp.status_code != 200:
            # Previously a non-200 answer was dropped without any diagnostic.
            print(f" ⚠️ llama-server returned HTTP {resp.status_code}")
            return ""

        # Tolerate a missing/empty "choices" list and null message/content
        # instead of surfacing them as IndexError/AttributeError.
        choices = resp.json().get("choices") or [{}]
        message = choices[0].get("message") or {}
        return (message.get("content") or "").strip()
    except Exception as e:
        # Best-effort boundary: any transport or decoding failure is reported
        # and mapped to an empty reply.
        print(f" ⚠️ llama-server error: {e}")
    return ""
|
|
|
|
|
|
def generate_5w1h_summary(parent, scene_num, total_scenes=17):
    """Ask gemma4 for a 5W1H analysis of one parent chunk.

    Args:
        parent: Row mapping with ``child_texts``, ``start_time`` and
            ``end_time``.
        scene_num: Scene number shown in the prompt header.
        total_scenes: Scene total shown in the prompt header. Defaults to 17,
            the value that used to be hard-coded in the prompt.

    Returns:
        The parsed JSON object (a dict) from the model's reply, or ``None``
        when the parent has no non-empty child text, the model returns
        nothing, or the reply contains no parseable JSON object.
    """
    texts = [t for t in (parent["child_texts"] or []) if t]
    if not texts:
        return None

    # Use only the first 3 and last 3 dialogue lines for context (much faster).
    sample_texts = texts[:3] + ["..."] + texts[-3:] if len(texts) > 6 else texts
    # Hard cap on the dialogue excerpt embedded in the prompt.
    combined = "\n".join(sample_texts)[:1500]
    duration = parent["end_time"] - parent["start_time"]

    prompt = f"""You are a film scene analyst. Analyze this scene and provide 5W1H analysis.

Scene {scene_num}/{total_scenes} | {duration:.0f}s | {len(texts)} dialogue lines

Key dialogue:
{combined}

Respond with ONLY this JSON:
{{"summary_5lines":"...","who":"...","what":"...","when":"...","where":"...","why":"...","how":"...","characters":[],"tone":[],"key_events":[]}}
IMPORTANT: "summary_5lines" must be EXACTLY 5 lines describing the scene. Each line should be a complete sentence separated by \\n."""

    response = call_gemma4(prompt, max_tokens=2000)
    if not response:
        return None

    # Simple JSON extraction: take everything from the first "{" to the last
    # "}" so surrounding chatter or code fences are ignored.
    start = response.find("{")
    end = response.rfind("}") + 1
    if start < 0 or end <= start:
        print(" ⚠️ No JSON object found in model response")
        return None

    try:
        return json.loads(response[start:end])
    except ValueError as e:
        # json.JSONDecodeError is a ValueError; report it instead of
        # swallowing it silently so malformed replies can be diagnosed.
        print(f" ⚠️ Could not parse model JSON: {e}")
        return None
|
|
|
|
|
|
def update_parent_chunk(parent, analysis):
    """Persist a 5W1H analysis onto one ``parent_chunks`` row.

    Writes ``summary_5lines`` to ``summary_text`` and stores the full 5W1H
    structure under ``metadata["structured_summary"]``, alongside
    ``auto_generated_by`` and ``chunk_count``.

    Args:
        parent: Row mapping with ``id``, ``metadata`` and ``child_texts``.
        analysis: Parsed model output; a falsy value means nothing to write.

    Returns:
        ``False`` when ``analysis`` is falsy (no database access happens),
        ``True`` after the update has been committed.

    Raises:
        psycopg2.Error: If connecting, updating or committing fails. The
            connection is closed before the exception propagates.
    """
    if not analysis:
        return False

    summary = analysis.get("summary_5lines", "")
    # Structured summary text (5 lines); formatted so non-string values
    # are still stored as text.
    structured_text = f"{summary}"

    # Work on a copy so the caller's row is not mutated as a side effect.
    metadata = dict(parent["metadata"] or {})
    metadata["auto_generated_by"] = "gemma4"
    metadata["chunk_count"] = len(parent["child_texts"] or [])
    metadata["structured_summary"] = {
        "summary_5lines": summary,
        "who": analysis.get("who", ""),
        "what": analysis.get("what", ""),
        "when": analysis.get("when", ""),
        "where": analysis.get("where", ""),
        "why": analysis.get("why", ""),
        "how": analysis.get("how", ""),
        "characters": analysis.get("characters", []),
        "tone": analysis.get("tone", []),
        "key_events": analysis.get("key_events", []),
    }

    # Connect only once the payload is fully built, and always close the
    # connection; closing without a commit discards the pending update.
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cur:
            cur.execute(
                """
                UPDATE parent_chunks
                SET summary_text = %s,
                    metadata = %s::jsonb
                WHERE id = %s
                """,
                (structured_text, json.dumps(metadata, ensure_ascii=False), parent["id"]),
            )
        conn.commit()
    finally:
        conn.close()
    return True
|
|
|
|
|
|
def _preview(value, limit, default="N/A"):
    """Return *value* as text cut to *limit* characters for console output."""
    # Model output is not guaranteed to be a string (null, list, number),
    # so convert before slicing instead of slicing the raw value.
    if value is None:
        return default
    return str(value)[:limit]


def main():
    """Regenerate and store 5W1H summaries for every parent chunk of ``UUID``.

    For each parent chunk: print its time range and old summary, request a
    new analysis, print a short preview, and write it to the database.
    A final line reports how many chunks were updated.
    """
    print(f"🎬 Regenerating 5W1H summaries for {UUID}")
    print(f" Using llama.cpp server at {LLAMA_URL}")
    print("=" * 70)

    parents = get_parent_with_children()
    print(f"📥 Found {len(parents)} parent chunks")

    success_count = 0
    for parent in parents:
        duration = parent["end_time"] - parent["start_time"]
        text_count = len(parent["child_texts"] or [])
        print(
            f"\n🎬 Scene {parent['scene_order']}: {parent['start_time']:.0f}s-{parent['end_time']:.0f}s ({duration:.0f}s, {text_count} chunks)"
        )
        if parent["old_summary"]:
            print(f" Old: {parent['old_summary'][:80]}...")

        analysis = generate_5w1h_summary(parent, parent["scene_order"])
        if not analysis:
            print(" ❌ Failed to generate analysis")
            continue

        print(f" ✅ Summary: {_preview(analysis.get('summary_5lines'), 100)}...")
        print(f" 👤 Who: {_preview(analysis.get('who'), 60)}")
        print(f" 📍 Where: {_preview(analysis.get('where'), 60)}")
        print(f" 💡 Why: {_preview(analysis.get('why'), 60)}")

        if update_parent_chunk(parent, analysis):
            success_count += 1

    print(f"\n{'=' * 70}")
    print(
        f"✅ Updated {success_count}/{len(parents)} parent chunks with 5W1H summaries"
    )
|
|
|
|
|
|
# Script entry point: run the regeneration only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|