#!/opt/homebrew/bin/python3.11 """ Local LLM (Gemma 4) Capability & Speed Benchmark """ import json import time import subprocess UUID = "384b0ff44aaaa1f1" ASR_PATH = f"output/{UUID}/{UUID}.asr.json" MODEL = "gemma4:latest" def load_context(n_segments=20): try: with open(ASR_PATH, "r") as f: data = json.load(f) segments = data.get("segments", [])[50 : 50 + n_segments] # Pick a middle chunk text = " ".join([s.get("text", "") for s in segments]) return text except Exception as e: return f"Error loading context: {e}" def run_test(name, prompt_template, context_text): print(f"\n๐Ÿงช Testing: {name}") print("-" * 50) prompt = prompt_template.format(context=context_text) full_input = f"{prompt}\n\nContext:\n{context_text}" start = time.time() try: result = subprocess.run( ["ollama", "run", MODEL, full_input], capture_output=True, text=True, timeout=120, ) duration = time.time() - start output = result.stdout.strip() # Check if it's JSON (basic check) is_json = output.startswith("{") and output.endswith("}") tag = "JSON โœ…" if is_json else "Text โš ๏ธ" print(f"โฑ๏ธ Duration: {duration:.2f}s | Format: {tag}") print(f"๐Ÿค– Output: {output[:300]}...") return duration, output except Exception as e: duration = time.time() - start print(f"โŒ Failed ({duration:.2f}s): {e}") return duration, None def main(): print(f"๐Ÿš€ Starting Gemma 4 Capability Test on Context ({MODEL})") context = load_context() print(f"๐Ÿ“‚ Loaded Context: {len(context)} chars") if len(context) < 50: print("โš ๏ธ Context too short, aborting.") return print(f"๐Ÿ‘€ Preview: {context[:100]}...") results = [] # Test 1: Summarization results.append( run_test( "1. Plot Summarization (ๆ‘˜่ฆ)", "Summarize the following movie dialogue into ONE sentence. Do not explain, just give the summary.", context, ) ) # Test 2: 5W1H Extraction results.append( run_test( "2. 5W1H Entity Extraction (่ณ‡่จŠๆๅ–)", "Extract the following information from the text and output valid JSON only:\n{{'who': '...', 'what': '...', 'where': '...', 'when': '...'}}.", context, ) ) # Test 3: Sentiment Analysis results.append( run_test( "3. Sentiment & Mood Detection (ๆƒ…็ท’ๅˆ†ๆž)", "Analyze the emotional tone of the dialogue. Output JSON: {{'mood': ['...'], 'tension_level': 'high/medium/low'}}.", context, ) ) # Test 4: Logical Reasoning (Plot Deduction) results.append( run_test( "4. Logical Reasoning (้‚่ผฏๆŽจ็†)", "Based on the text, answer: What are the characters discussing or investigating? Be specific.", context, ) ) # Summary valid_results = [r[0] for r in results if r[0] is not None] if valid_results: total = sum(valid_results) avg = total / len(valid_results) print("\n๐Ÿ“Š Benchmark Summary:") print(f"Total Time for 4 tasks: {total:.2f}s") print(f"Average Time: {avg:.2f}s per task") if avg > 20: print( "\nโš ๏ธ Note: Gemma 4 is accurate but slow. Consider asynchronous processing or smaller models for speed." ) else: print("\nโœ… Note: Performance is acceptable for background tasks.") if __name__ == "__main__": main()