fix: TKG stats API returning 0 - count_by_type used wrong column

- tkg_nodes has no edge_type column, query was failing silently - Split into count_nodes(node_type) and count_edges(edge_type) - Fixed text_region → text_trace node type name - Also: OCR frame fix in rule1 (start_frame/end_frame computed from start_time/end_time × FPS)
2026-07-02 14:53:47 +08:00
parent 6507766ea2
commit 619b056ada
3 changed files with 210 additions and 23 deletions
--- a/docs_v1.0/DESIGN/Audio_Scene_Detection_POC.md
+++ b/docs_v1.0/DESIGN/Audio_Scene_Detection_POC.md
@@ -0,0 +1,162 @@
+---
+title: Audio Scene & Instrument Detection POC Plan
+version: 0.1
+date: 2026-07-02
+author: OpenCode
+status: planned
+---
+
+| scope | status | applicable to |
+|-------|--------|---------------|
+| Audio processing pipeline | planned | Video files with non-speech audio |
+
+## Goal
+
+Detect non-speech audio events (instruments, music, environmental sounds) in video files alongside existing ASRX speech recognition.
+
+## Why
+
+Current pipeline only detects speech (ASRX → 64 segments + 1554 speaker embeddings). Instrument sounds, background music, and environmental audio are completely ignored.
+
+## Technical Options
+
+### Option A: PANNs (Pre-trained Audio Neural Networks)
+- **Model**: Cnn14 (~81M params, ~330MB weights)
+- **Classes**: 527 AudioSet classes (piano, guitar, drums, speech, etc.)
+- **Pros**: Production-ready, accurate, PyTorch-based
+- **Cons**: Large download, ~200MB RAM per inference
+- **Install**: `pip install panns-inference`
+
+### Option B: YAMNet (Google)
+- **Model**: MobileNet-based, 4MB weights
+- **Classes**: 521 AudioSet classes
+- **Pros**: Lightweight, fast
+- **Cons**: Requires TensorFlow (not currently installed)
+- **Install**: `pip install yamnet` + TensorFlow
+
+### Option C: torchaudio + heuristics (lightweight fallback)
+- Use existing PyTorch + torchaudio
+- Extract spectral features (MFCC, centroid, energy)
+- Simple classification: speech vs music vs silence
+- **Pros**: No extra dependencies
+- **Cons**: Less accurate, limited classes
+
+## Recommended: Option A (PANNs)
+
+## Pipeline Integration
+
+```
+Video → Audio Extract → ASRX (speech)     → Speaker Embeddings (3.4/s)
+                      → Audio Scene (new) → Scene Labels (1/s)
+```
+
+### New Processor: `audio_scene`
+
+| Field | Value |
+|-------|-------|
+| Processor type | `audio_scene` |
+| Input | Video file (audio track) |
+| Output | `{file_uuid}.audio_scene.json` |
+| Sampling | 1-second segments |
+| Qdrant collection | `momentry_{schema}_audio_scene` |
+
+### Output Format
+
+```json
+{
+  "file_uuid": "...",
+  "segments": [
+    {
+      "start_time": 0.0,
+      "end_time": 1.0,
+      "primary_class": "speech",
+      "confidence": 0.95,
+      "top_classes": [
+        {"class": "speech", "score": 0.95},
+        {"class": "music", "score": 0.03},
+        {"class": "piano", "score": 0.01}
+      ]
+    }
+  ],
+  "summary": {
+    "speech_ratio": 0.72,
+    "music_ratio": 0.15,
+    "silence_ratio": 0.08,
+    "instrument_ratio": 0.05,
+    "instruments_detected": ["piano", "guitar"]
+  }
+}
+```
+
+### Qdrant Storage
+
+| Field | Type | Purpose |
+|-------|------|---------|
+| `file_uuid` | string | Filter by file |
+| `start_time` | float | Segment start |
+| `end_time` | float | Segment end |
+| `primary_class` | keyword | Filter by class |
+| `confidence` | float | Filter by confidence |
+| `instrument_name` | keyword | Search by instrument |
+| `vector` | f32[2048] | Audio embedding for similarity search |
+
+### Processor Dependencies
+
+```
+audio_scene → (no dependencies, runs in parallel with ASRX)
+```
+
+## Key AudioSet Instrument Classes
+
+| Category | Classes |
+|----------|---------|
+| Piano | Piano, Electric piano, Keyboard |
+| Guitar | Guitar, Electric guitar, Acoustic guitar |
+| Drums | Drum kit, Snare drum, Cymbal, Hi-hat |
+| Strings | Violin, Cello, Harp, Double bass |
+| Wind | Flute, Saxophone, Trumpet, Clarinet |
+| Voice | Speech, Singing, Chant, Choir |
+| Other | Music, Percussion, Organ, Synthesizer |
+
+## POC Steps
+
+1. **Install panns-inference**
+   ```bash
+   pip install panns-inference
+   ```
+
+2. **Create `scripts/audio_scene_processor.py`**
+   - Load audio via ffmpeg → numpy array
+   - Process 1-second segments through Cnn14
+   - Save results to JSON + Qdrant
+
+3. **Add processor type to pipeline**
+   - Add `AudioScene` to `ProcessorType` enum
+   - Add to worker's processor dispatch
+   - Add `AUDIO_SCENE_TIMEOUT` config
+
+4. **Test with existing video**
+   - Run on KOBA interview video
+   - Verify instrument detection accuracy
+   - Check performance (time, memory)
+
+5. **Integrate with search**
+   - Add audio_scene to universal_search
+   - Add filter by audio class (speech/music/instrument)
+
+## Estimated Effort
+
+| Step | Time |
+|------|------|
+| Install + prototype script | 2-3 hours |
+| Pipeline integration | 1-2 hours |
+| Qdrant + search integration | 1 hour |
+| Testing + tuning | 1-2 hours |
+| **Total** | **5-8 hours** |
+
+## Future Enhancements
+
+- Real-time audio classification during processing
+- Audio event timeline visualization
+- Combine with TKG for audio-visual relationships
+- Background music detection for copyright checks
--- a/src/api/scan.rs
+++ b/src/api/scan.rs
@@ -836,24 +836,31 @@ async fn get_file_stats(
    let tkg_nodes_table = schema::table_name("tkg_nodes");
    let tkg_edges_table = schema::table_name("tkg_edges");

+    let tkg_nodes_total: i64 = sqlx::query_scalar::<_, i64>(&format!("SELECT COUNT(*) FROM {} WHERE file_uuid = $1", tkg_nodes_table))
+        .bind(&file_uuid).fetch_one(pool).await.unwrap_or(0);
+    let tkg_edges_total: i64 = sqlx::query_scalar::<_, i64>(&format!("SELECT COUNT(*) FROM {} WHERE file_uuid = $1", tkg_edges_table))
+        .bind(&file_uuid).fetch_one(pool).await.unwrap_or(0);
+
    let tkg = TkgFileStats {
-        face_track_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "face_track").await,
-        gaze_track_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "gaze_track").await,
-        lip_track_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "lip_track").await,
-        text_region_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "text_region").await,
-        appearance_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "appearance_trace").await,
-        accessory_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "accessory").await,
-        object_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "yolo_object").await,
-        hand_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "hand").await,
-        speaker_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "speaker").await,
-        co_occurrence_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "CO_OCCURS_WITH").await,
-        speaker_face_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "SPEAKS_AS").await,
-        face_face_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "FACE_TO_FACE").await,
-        mutual_gaze_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "MUTUAL_GAZE").await,
-        lip_sync_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "LIP_SYNC").await,
-        has_appearance_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "HAS_APPEARANCE").await,
-        wears_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "WEARS").await,
-        hand_object_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "HAND_OBJECT").await,
+        total_nodes: tkg_nodes_total,
+        total_edges: tkg_edges_total,
+        face_track_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "face_track").await,
+        gaze_track_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "gaze_track").await,
+        lip_track_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "lip_track").await,
+        text_region_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "text_trace").await,
+        appearance_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "appearance_trace").await,
+        accessory_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "accessory").await,
+        object_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "yolo_object").await,
+        hand_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "hand").await,
+        speaker_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "speaker").await,
+        co_occurrence_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "CO_OCCURS_WITH").await,
+        speaker_face_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "SPEAKS_AS").await,
+        face_face_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "FACE_TO_FACE").await,
+        mutual_gaze_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "MUTUAL_GAZE").await,
+        lip_sync_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "LIP_SYNC").await,
+        has_appearance_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "HAS_APPEARANCE").await,
+        wears_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "WEARS").await,
+        hand_object_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "HAND_OBJECT").await,
        ..Default::default()
    };

@@ -890,13 +897,25 @@ async fn get_file_stats(
    }))
 }

-async fn count_by_type(pool: &sqlx::PgPool, table: &str, file_uuid: &str, type_val: &str) -> i64 {
+async fn count_nodes(pool: &sqlx::PgPool, table: &str, file_uuid: &str, node_type: &str) -> i64 {
    sqlx::query_scalar::<_, i64>(&format!(
-        "SELECT COUNT(*) FROM {} WHERE file_uuid = $1 AND (node_type = $2 OR edge_type = $2)",
+        "SELECT COUNT(*) FROM {} WHERE file_uuid = $1 AND node_type = $2",
        table
    ))
    .bind(file_uuid)
-    .bind(type_val)
+    .bind(node_type)
+    .fetch_one(pool)
+    .await
+    .unwrap_or(0)
+}
+
+async fn count_edges(pool: &sqlx::PgPool, table: &str, file_uuid: &str, edge_type: &str) -> i64 {
+    sqlx::query_scalar::<_, i64>(&format!(
+        "SELECT COUNT(*) FROM {} WHERE file_uuid = $1 AND edge_type = $2",
+        table
+    ))
+    .bind(file_uuid)
+    .bind(edge_type)
    .fetch_one(pool)
    .await
    .unwrap_or(0)
--- a/src/core/chunk/rule1_ingest.rs
+++ b/src/core/chunk/rule1_ingest.rs
@@ -13,7 +13,7 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result
    let pool = db.pool();
    let pre_chunks_table = schema::table_name("pre_chunks");

-    let asr_segments = fetch_asr_segments(pool, file_uuid, &pre_chunks_table).await?;
+    let asr_segments = fetch_asr_segments(pool, file_uuid, &pre_chunks_table, fps).await?;
    let ocr_map = fetch_ocr_texts(pool, file_uuid, &pre_chunks_table).await?;

    let video = db
@@ -97,6 +97,7 @@ async fn fetch_asr_segments(
    pool: &PgPool,
    file_uuid: &str,
    table: &str,
+    fps: f64,
 ) -> Result<Vec<AsrSegment>> {
    let query = format!(
        r#"
@@ -114,8 +115,6 @@ async fn fetch_asr_segments(
    let segments: Vec<AsrSegment> = rows
        .iter()
        .map(|row| {
-            let start_frame: i64 = row.try_get("start_frame").unwrap_or(0);
-            let end_frame: i64 = row.try_get("end_frame").unwrap_or(0);
            let start_time: f64 = row.try_get("start_time").unwrap_or(0.0);
            let end_time_raw: Option<f64> = row.try_get("end_time").ok();
            let data: Value = row.try_get("data").unwrap_or(Value::Null);
@@ -124,6 +123,13 @@ async fn fetch_asr_segments(
                .or_else(|| data.get("end_time").and_then(|v| v.as_f64()))
                .unwrap_or(0.0);

+            let start_frame = (start_time * fps) as i64;
+            let end_frame = if end_time > 0.0 {
+                (end_time * fps) as i64
+            } else {
+                start_frame
+            };
+
            if end_time <= 0.0 {
                warn!(
                    "ASR segment end_time is 0.0 for file {} (frame {}..{})",