fix: TKG stats API returning 0 - count_by_type used wrong column
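- tkg_nodes has no edge_type column, so the combined `node_type = $2 OR edge_type = $2` query errored and `unwrap_or(0)` silently turned the error into a count of 0
- Split count_by_type into count_nodes(node_type) and count_edges(edge_type)
- Fixed the text node type name: text_region → text_trace
- Also: OCR frame fix in rule1 (ASR segment start_frame/end_frame are now computed as start_time/end_time × FPS)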
This commit is contained in:
162
docs_v1.0/DESIGN/Audio_Scene_Detection_POC.md
Normal file
162
docs_v1.0/DESIGN/Audio_Scene_Detection_POC.md
Normal file
@@ -0,0 +1,162 @@
|
||||
---
|
||||
title: Audio Scene & Instrument Detection POC Plan
|
||||
version: 0.1
|
||||
date: 2026-07-02
|
||||
author: OpenCode
|
||||
status: planned
|
||||
---
|
||||
|
||||
| scope | status | applicable to |
|
||||
|-------|--------|---------------|
|
||||
| Audio processing pipeline | planned | Video files with non-speech audio |
|
||||
|
||||
## Goal
|
||||
|
||||
Detect non-speech audio events (instruments, music, environmental sounds) in video files alongside existing ASRX speech recognition.
|
||||
|
||||
## Why
|
||||
|
||||
Current pipeline only detects speech (ASRX → 64 segments + 1554 speaker embeddings). Instrument sounds, background music, and environmental audio are completely ignored.
|
||||
|
||||
## Technical Options
|
||||
|
||||
### Option A: PANNs (Pre-trained Audio Neural Networks)
|
||||
- **Model**: Cnn14 (~80M params, ~330 MB checkpoint)
|
||||
- **Classes**: 527 AudioSet classes (piano, guitar, drums, speech, etc.)
|
||||
- **Pros**: Production-ready, accurate, PyTorch-based
|
||||
- **Cons**: Large download, ~200MB RAM per inference
|
||||
- **Install**: `pip install panns-inference`
|
||||
|
||||
### Option B: YAMNet (Google)
|
||||
- **Model**: MobileNet-based, 4MB weights
|
||||
- **Classes**: 521 AudioSet classes
|
||||
- **Pros**: Lightweight, fast
|
||||
- **Cons**: Requires TensorFlow (not currently installed)
|
||||
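- **Install**: `pip install tensorflow tensorflow-hub` (YAMNet is loaded from TensorFlow Hub; `pip install yamnet` is not the official distribution — verify before use)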
|
||||
|
||||
### Option C: torchaudio + heuristics (lightweight fallback)
|
||||
- Use existing PyTorch + torchaudio
|
||||
- Extract spectral features (MFCC, centroid, energy)
|
||||
- Simple classification: speech vs music vs silence
|
||||
- **Pros**: No extra dependencies
|
||||
- **Cons**: Less accurate, limited classes
|
||||
|
||||
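## Recommended: Option A (PANNs)

PANNs runs on the PyTorch stack the pipeline already uses (no TensorFlow dependency, unlike Option B) and covers the full set of AudioSet instrument classes, which Option C's heuristics cannot.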
|
||||
|
||||
## Pipeline Integration
|
||||
|
||||
```
|
||||
Video → Audio Extract → ASRX (speech) → Speaker Embeddings (3.4/s)
|
||||
                      → Audio Scene (new) → Scene Labels (1/s)
|
||||
```
|
||||
|
||||
### New Processor: `audio_scene`
|
||||
|
||||
| Field | Value |
|
||||
|-------|-------|
|
||||
| Processor type | `audio_scene` |
|
||||
| Input | Video file (audio track) |
|
||||
| Output | `{file_uuid}.audio_scene.json` |
|
||||
| Sampling | 1-second segments |
|
||||
| Qdrant collection | `momentry_{schema}_audio_scene` |
|
||||
|
||||
### Output Format
|
||||
|
||||
```json
|
||||
{
|
||||
"file_uuid": "...",
|
||||
"segments": [
|
||||
{
|
||||
"start_time": 0.0,
|
||||
"end_time": 1.0,
|
||||
"primary_class": "speech",
|
||||
"confidence": 0.95,
|
||||
"top_classes": [
|
||||
{"class": "speech", "score": 0.95},
|
||||
{"class": "music", "score": 0.03},
|
||||
{"class": "piano", "score": 0.01}
|
||||
]
|
||||
}
|
||||
],
|
||||
"summary": {
|
||||
"speech_ratio": 0.72,
|
||||
"music_ratio": 0.15,
|
||||
"silence_ratio": 0.08,
|
||||
"instrument_ratio": 0.05,
|
||||
"instruments_detected": ["piano", "guitar"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Qdrant Storage
|
||||
|
||||
| Field | Type | Purpose |
|
||||
|-------|------|---------|
|
||||
| `file_uuid` | string | Filter by file |
|
||||
| `start_time` | float | Segment start |
|
||||
| `end_time` | float | Segment end |
|
||||
| `primary_class` | keyword | Filter by class |
|
||||
| `confidence` | float | Filter by confidence |
|
||||
| `instrument_name` | keyword | Search by instrument |
|
||||
| `vector` | f32[2048] | Audio embedding for similarity search |
|
||||
|
||||
### Processor Dependencies
|
||||
|
||||
```
|
||||
audio_scene → (no dependencies, runs parallel with ASRX)
|
||||
```
|
||||
|
||||
## Key AudioSet Instrument Classes
|
||||
|
||||
| Category | Classes |
|
||||
|----------|---------|
|
||||
| Piano | Piano, Electric piano, Keyboard |
|
||||
| Guitar | Guitar, Electric guitar, Acoustic guitar |
|
||||
| Drums | Drum kit, Snare drum, Cymbal, Hi-hat |
|
||||
| Strings | Violin, Cello, Harp, Double bass |
|
||||
| Wind | Flute, Saxophone, Trumpet, Clarinet |
|
||||
| Voice | Speech, Singing, Chant, Choir |
|
||||
| Other | Music, Percussion, Organ, Synthesizer |
|
||||
|
||||
## POC Steps
|
||||
|
||||
1. **Install panns-inference**
|
||||
```bash
|
||||
pip install panns-inference
|
||||
```
|
||||
|
||||
2. **Create `scripts/audio_scene_processor.py`**
|
||||
- Load audio via ffmpeg → numpy array
|
||||
- Process 1-second segments through Cnn14
|
||||
- Save results to JSON + Qdrant
|
||||
|
||||
3. **Add processor type to pipeline**
|
||||
- Add `AudioScene` to `ProcessorType` enum
|
||||
- Add to worker's processor dispatch
|
||||
- Add `AUDIO_SCENE_TIMEOUT` config
|
||||
|
||||
4. **Test with existing video**
|
||||
- Run on KOBA interview video
|
||||
- Verify instrument detection accuracy
|
||||
- Check performance (time, memory)
|
||||
|
||||
5. **Integrate with search**
|
||||
- Add audio_scene to universal_search
|
||||
- Add filter by audio class (speech/music/instrument)
|
||||
|
||||
## Estimated Effort
|
||||
|
||||
| Step | Time |
|
||||
|------|------|
|
||||
| Install + prototype script | 2-3 hours |
|
||||
| Pipeline integration | 1-2 hours |
|
||||
| Qdrant + search integration | 1 hour |
|
||||
| Testing + tuning | 1-2 hours |
|
||||
| **Total** | **5-8 hours** |
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
- Real-time audio classification during processing
|
||||
- Audio event timeline visualization
|
||||
- Combine with TKG for audio-visual relationships
|
||||
- Background music detection for copyright checks
|
||||
@@ -836,24 +836,31 @@ async fn get_file_stats(
|
||||
let tkg_nodes_table = schema::table_name("tkg_nodes");
|
||||
let tkg_edges_table = schema::table_name("tkg_edges");
|
||||
|
||||
let tkg_nodes_total: i64 = sqlx::query_scalar::<_, i64>(&format!("SELECT COUNT(*) FROM {} WHERE file_uuid = $1", tkg_nodes_table))
|
||||
.bind(&file_uuid).fetch_one(pool).await.unwrap_or(0);
|
||||
let tkg_edges_total: i64 = sqlx::query_scalar::<_, i64>(&format!("SELECT COUNT(*) FROM {} WHERE file_uuid = $1", tkg_edges_table))
|
||||
.bind(&file_uuid).fetch_one(pool).await.unwrap_or(0);
|
||||
|
||||
let tkg = TkgFileStats {
|
||||
face_track_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "face_track").await,
|
||||
gaze_track_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "gaze_track").await,
|
||||
lip_track_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "lip_track").await,
|
||||
text_region_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "text_region").await,
|
||||
appearance_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "appearance_trace").await,
|
||||
accessory_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "accessory").await,
|
||||
object_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "yolo_object").await,
|
||||
hand_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "hand").await,
|
||||
speaker_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "speaker").await,
|
||||
co_occurrence_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "CO_OCCURS_WITH").await,
|
||||
speaker_face_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "SPEAKS_AS").await,
|
||||
face_face_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "FACE_TO_FACE").await,
|
||||
mutual_gaze_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "MUTUAL_GAZE").await,
|
||||
lip_sync_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "LIP_SYNC").await,
|
||||
has_appearance_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "HAS_APPEARANCE").await,
|
||||
wears_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "WEARS").await,
|
||||
hand_object_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "HAND_OBJECT").await,
|
||||
total_nodes: tkg_nodes_total,
|
||||
total_edges: tkg_edges_total,
|
||||
face_track_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "face_track").await,
|
||||
gaze_track_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "gaze_track").await,
|
||||
lip_track_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "lip_track").await,
|
||||
text_region_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "text_trace").await,
|
||||
appearance_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "appearance_trace").await,
|
||||
accessory_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "accessory").await,
|
||||
object_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "yolo_object").await,
|
||||
hand_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "hand").await,
|
||||
speaker_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "speaker").await,
|
||||
co_occurrence_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "CO_OCCURS_WITH").await,
|
||||
speaker_face_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "SPEAKS_AS").await,
|
||||
face_face_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "FACE_TO_FACE").await,
|
||||
mutual_gaze_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "MUTUAL_GAZE").await,
|
||||
lip_sync_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "LIP_SYNC").await,
|
||||
has_appearance_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "HAS_APPEARANCE").await,
|
||||
wears_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "WEARS").await,
|
||||
hand_object_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "HAND_OBJECT").await,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
@@ -890,13 +897,25 @@ async fn get_file_stats(
|
||||
}))
|
||||
}
|
||||
|
||||
async fn count_by_type(pool: &sqlx::PgPool, table: &str, file_uuid: &str, type_val: &str) -> i64 {
|
||||
async fn count_nodes(pool: &sqlx::PgPool, table: &str, file_uuid: &str, node_type: &str) -> i64 {
|
||||
sqlx::query_scalar::<_, i64>(&format!(
|
||||
"SELECT COUNT(*) FROM {} WHERE file_uuid = $1 AND (node_type = $2 OR edge_type = $2)",
|
||||
"SELECT COUNT(*) FROM {} WHERE file_uuid = $1 AND node_type = $2",
|
||||
table
|
||||
))
|
||||
.bind(file_uuid)
|
||||
.bind(type_val)
|
||||
.bind(node_type)
|
||||
.fetch_one(pool)
|
||||
.await
|
||||
.unwrap_or(0)
|
||||
}
|
||||
|
||||
async fn count_edges(pool: &sqlx::PgPool, table: &str, file_uuid: &str, edge_type: &str) -> i64 {
|
||||
sqlx::query_scalar::<_, i64>(&format!(
|
||||
"SELECT COUNT(*) FROM {} WHERE file_uuid = $1 AND edge_type = $2",
|
||||
table
|
||||
))
|
||||
.bind(file_uuid)
|
||||
.bind(edge_type)
|
||||
.fetch_one(pool)
|
||||
.await
|
||||
.unwrap_or(0)
|
||||
|
||||
@@ -13,7 +13,7 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result
|
||||
let pool = db.pool();
|
||||
let pre_chunks_table = schema::table_name("pre_chunks");
|
||||
|
||||
let asr_segments = fetch_asr_segments(pool, file_uuid, &pre_chunks_table).await?;
|
||||
let asr_segments = fetch_asr_segments(pool, file_uuid, &pre_chunks_table, fps).await?;
|
||||
let ocr_map = fetch_ocr_texts(pool, file_uuid, &pre_chunks_table).await?;
|
||||
|
||||
let video = db
|
||||
@@ -97,6 +97,7 @@ async fn fetch_asr_segments(
|
||||
pool: &PgPool,
|
||||
file_uuid: &str,
|
||||
table: &str,
|
||||
fps: f64,
|
||||
) -> Result<Vec<AsrSegment>> {
|
||||
let query = format!(
|
||||
r#"
|
||||
@@ -114,8 +115,6 @@ async fn fetch_asr_segments(
|
||||
let segments: Vec<AsrSegment> = rows
|
||||
.iter()
|
||||
.map(|row| {
|
||||
let start_frame: i64 = row.try_get("start_frame").unwrap_or(0);
|
||||
let end_frame: i64 = row.try_get("end_frame").unwrap_or(0);
|
||||
let start_time: f64 = row.try_get("start_time").unwrap_or(0.0);
|
||||
let end_time_raw: Option<f64> = row.try_get("end_time").ok();
|
||||
let data: Value = row.try_get("data").unwrap_or(Value::Null);
|
||||
@@ -124,6 +123,13 @@ async fn fetch_asr_segments(
|
||||
.or_else(|| data.get("end_time").and_then(|v| v.as_f64()))
|
||||
.unwrap_or(0.0);
|
||||
|
||||
let start_frame = (start_time * fps) as i64;
|
||||
let end_frame = if end_time > 0.0 {
|
||||
(end_time * fps) as i64
|
||||
} else {
|
||||
start_frame
|
||||
};
|
||||
|
||||
if end_time <= 0.0 {
|
||||
warn!(
|
||||
"ASR segment end_time is 0.0 for file {} (frame {}..{})",
|
||||
|
||||
Reference in New Issue
Block a user