feat: trace-level matching, health watcher/worker status, timezone config

This commit is contained in:
Accusys
2026-05-21 01:08:30 +08:00
parent 8ede4be159
commit bebaa743ed
60 changed files with 6110 additions and 1586 deletions

View File

@@ -75,15 +75,13 @@ pub async fn ingest_rule3(pool: &PgPool, file_uuid: &str) -> Result<usize> {
// Query chunks table for Rule 1 sentence chunks
let chunk_table = schema::table_name("chunk");
let rule1_rows: Vec<(String,)> = sqlx::query_as(
&format!(
"SELECT chunk_id FROM {} \
let rule1_rows: Vec<(String,)> = sqlx::query_as(&format!(
"SELECT chunk_id FROM {} \
WHERE file_uuid = $1 AND chunk_type = 'sentence' \
AND start_frame >= $2 \
AND end_frame <= $3",
chunk_table
),
)
chunk_table
))
.bind(file_uuid)
.bind(scene.start_frame as i64)
.bind(scene.end_frame as i64)
@@ -101,16 +99,14 @@ pub async fn ingest_rule3(pool: &PgPool, file_uuid: &str) -> Result<usize> {
// Let's re-query text directly.
}
let texts: Vec<String> = sqlx::query_scalar(
&format!(
"SELECT text_content FROM {} \
let texts: Vec<String> = sqlx::query_scalar(&format!(
"SELECT text_content FROM {} \
WHERE file_uuid = $1 AND chunk_type = 'sentence' \
AND start_frame >= $2 \
AND end_frame <= $3 \
ORDER BY start_frame ASC",
chunk_table
),
)
chunk_table
))
.bind(file_uuid)
.bind(scene.start_frame as i64)
.bind(scene.end_frame as i64)
@@ -154,16 +150,14 @@ pub async fn ingest_rule3(pool: &PgPool, file_uuid: &str) -> Result<usize> {
"scene_number": scene.scene_number
});
sqlx::query(
&format!(
"INSERT INTO {} (file_uuid, chunk_id, chunk_type, \
sqlx::query(&format!(
"INSERT INTO {} (file_uuid, chunk_id, chunk_type, \
start_time, end_time, fps, start_frame, end_frame, \
content, text_content, summary_text, metadata, child_chunk_ids) \
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13) \
ON CONFLICT (file_uuid, chunk_id) DO NOTHING",
chunk_table
),
)
chunk_table
))
.bind(file_uuid)
.bind(&chunk_id)
.bind(scene.scene_number as i32)

View File

@@ -20,8 +20,7 @@ pub fn set_cache_enabled(enabled: bool) {
}
// Switch 1: watcher detects new file → auto-register
pub static RUNTIME_WATCHER_AUTO_REGISTER: Lazy<RwLock<bool>> =
Lazy::new(|| RwLock::new(false));
pub static RUNTIME_WATCHER_AUTO_REGISTER: Lazy<RwLock<bool>> = Lazy::new(|| RwLock::new(false));
pub fn get_watcher_auto_register() -> bool {
*RUNTIME_WATCHER_AUTO_REGISTER.read().unwrap()
@@ -33,8 +32,7 @@ pub fn set_watcher_auto_register(enabled: bool) {
}
// Switch 2: register → auto-trigger processing pipeline
pub static RUNTIME_AUTO_PIPELINE_ENABLED: Lazy<RwLock<bool>> =
Lazy::new(|| RwLock::new(false));
pub static RUNTIME_AUTO_PIPELINE_ENABLED: Lazy<RwLock<bool>> = Lazy::new(|| RwLock::new(false));
pub fn get_auto_pipeline_enabled() -> bool {
*RUNTIME_AUTO_PIPELINE_ENABLED.read().unwrap()
@@ -107,6 +105,30 @@ pub static REDIS_KEY_PREFIX: Lazy<String> =
pub static DATABASE_SCHEMA: Lazy<String> =
Lazy::new(|| env::var("DATABASE_SCHEMA").unwrap_or_else(|_| "public".to_string()));
/// Timezone name used by the system, resolved once on first access.
///
/// Resolution order:
/// 1. `MOMENTRY_TIMEZONE` environment variable, when set and non-empty.
/// 2. `TZ` environment variable, when set and non-empty.
/// 3. The target of the `/etc/localtime` symlink: everything after the first
///    `/zoneinfo/` path component.
/// 4. The hard-coded fallback `"Asia/Taipei"`.
pub static SYSTEM_TIMEZONE: Lazy<String> = Lazy::new(|| {
    // Explicit configuration wins; the application-specific variable is checked first.
    for var in &["MOMENTRY_TIMEZONE", "TZ"] {
        if let Ok(tz) = env::var(var) {
            if !tz.is_empty() {
                return tz;
            }
        }
    }

    // macOS: /etc/localtime → /var/db/timezone/zoneinfo/Asia/Taipei
    // Linux: /etc/localtime → /usr/share/zoneinfo/Asia/Taipei
    //        or a relative link such as ../usr/share/zoneinfo/Asia/Taipei
    // Searching for the `/zoneinfo/` component covers all of these, whereas a
    // fixed absolute prefix would miss the relative form.
    const ZONEINFO_MARKER: &str = "/zoneinfo/";
    if let Ok(target) = std::fs::read_link("/etc/localtime") {
        let target = target.to_string_lossy();
        if let Some(idx) = target.find(ZONEINFO_MARKER) {
            let tz = &target[idx + ZONEINFO_MARKER.len()..];
            // A link pointing at the zoneinfo directory itself carries no zone name.
            if !tz.is_empty() {
                return tz.to_string();
            }
        }
    }

    "Asia/Taipei".to_string()
});
pub static MONGODB_DATABASE: Lazy<String> =
Lazy::new(|| env::var("MONGODB_DATABASE").unwrap_or_else(|_| "momentry".to_string()));

File diff suppressed because it is too large Load Diff

View File

@@ -15,9 +15,11 @@ pub struct QdrantDb {
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VectorPayload {
pub uuid: String,
pub file_uuid: String,
pub chunk_id: String,
pub chunk_type: String,
pub start_frame: i64,
pub end_frame: i64,
pub start_time: f64,
pub end_time: f64,
pub text: Option<String>,
@@ -189,6 +191,49 @@ impl QdrantDb {
Ok(())
}
/// Upsert several points into `collection` with a single Qdrant request.
///
/// Each element of `points` is `(point id, vector, optional payload)`. A point
/// without a payload is sent with only its `id` and `vector` fields. The request
/// URL carries `wait=true`.
///
/// An empty `points` slice is a no-op: nothing is sent and `Ok(())` is returned.
///
/// # Errors
/// Returns an error when the request cannot be sent, or when Qdrant answers with
/// a non-success status (the status and response body are included in the message).
pub async fn upsert_vectors_batch(
    &self,
    collection: &str,
    points: &[(u64, &[f32], Option<serde_json::Value>)],
) -> Result<()> {
    // Nothing to write: skip the network round-trip entirely.
    if points.is_empty() {
        return Ok(());
    }

    let url = format!(
        "{}/collections/{}/points?wait=true",
        self.base_url, collection
    );

    let qdrant_points: Vec<serde_json::Value> = points
        .iter()
        .map(|(id, vec, payload)| {
            let mut point = serde_json::json!({
                "id": id,
                "vector": vec,
            });
            // The payload key is only present when the caller supplied one.
            if let Some(pl) = payload {
                point["payload"] = pl.clone();
            }
            point
        })
        .collect();

    let body = serde_json::json!({ "points": qdrant_points });

    let response = self
        .client
        .put(&url)
        .header("api-key", &self.api_key)
        .json(&body)
        .send()
        .await
        .context("Failed to send batch upsert request to Qdrant")?;

    let status = response.status();
    if !status.is_success() {
        // Best effort: an unreadable body is reported as an empty string.
        let response_text = response.text().await.unwrap_or_default();
        anyhow::bail!("Qdrant batch upsert failed: {} - {}", status, response_text);
    }

    Ok(())
}
pub async fn upsert_vector(
&self,
chunk_id: &str,
@@ -207,12 +252,23 @@ impl QdrantDb {
);
let mut payload_map = HashMap::new();
payload_map.insert("uuid".to_string(), serde_json::json!(payload.uuid));
payload_map.insert(
"file_uuid".to_string(),
serde_json::json!(payload.file_uuid),
);
payload_map.insert("chunk_id".to_string(), serde_json::json!(payload.chunk_id));
payload_map.insert(
"chunk_type".to_string(),
serde_json::json!(payload.chunk_type),
);
payload_map.insert(
"start_frame".to_string(),
serde_json::json!(payload.start_frame),
);
payload_map.insert(
"end_frame".to_string(),
serde_json::json!(payload.end_frame),
);
payload_map.insert(
"start_time".to_string(),
serde_json::json!(payload.start_time),
@@ -224,7 +280,7 @@ impl QdrantDb {
// Generate consistent point ID from uuid and chunk_id
// Qdrant requires integer or UUID point IDs. We'll use a simple integer hash.
let point_id_str = format!("{}_{}", payload.uuid, chunk_id);
let point_id_str = format!("{}_{}", payload.file_uuid, chunk_id);
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
let mut hasher = DefaultHasher::new();
@@ -240,9 +296,9 @@ impl QdrantDb {
});
tracing::debug!(
"Upserting vector to Qdrant: chunk_id={}, uuid={}, vector_len={}",
"Upserting vector to Qdrant: chunk_id={}, file_uuid={}, vector_len={}",
chunk_id,
payload.uuid,
payload.file_uuid,
vector.len()
);
@@ -337,7 +393,7 @@ impl QdrantDb {
.map(|r| {
let uuid = r
.payload
.get("uuid")
.get("file_uuid")
.and_then(|v| v.as_str())
.unwrap_or("unknown")
.to_string();
@@ -409,7 +465,7 @@ impl QdrantDb {
.map(|r| {
let uuid = r
.payload
.get("uuid")
.get("file_uuid")
.and_then(|v| v.as_str())
.unwrap_or("unknown")
.to_string();
@@ -471,7 +527,7 @@ impl QdrantDb {
"filter": {
"must": [
{
"key": "uuid",
"key": "file_uuid",
"match": {
"value": uuid
}
@@ -532,7 +588,7 @@ impl QdrantDb {
.map(|r| {
let uuid = r
.payload
.get("uuid")
.get("file_uuid")
.and_then(|v| v.as_str())
.unwrap_or("unknown")
.to_string();
@@ -553,6 +609,89 @@ impl QdrantDb {
Ok(search_results)
}
/// Run a vector similarity search against an arbitrary Qdrant `collection`.
///
/// Points whose payload field `exclude_payload_key` equals `exclude_payload_value`
/// are filtered out (`must_not`). When `include_file_uuid` is given, results are
/// additionally restricted to points whose `file_uuid` payload matches it (`must`).
///
/// Returns up to `limit` hits as `(score, payload)` pairs, in the order Qdrant
/// returned them.
///
/// # Errors
/// Fails when the request cannot be sent, when Qdrant responds with a non-success
/// status, or when the response body cannot be parsed.
pub async fn search_face_collection(
    &self,
    collection: &str,
    query_vector: &[f32],
    limit: usize,
    exclude_payload_key: &str,
    exclude_payload_value: &str,
    include_file_uuid: Option<&str>,
) -> Result<Vec<(f64, HashMap<String, serde_json::Value>)>> {
    // Minimal view of the Qdrant search response: only the fields read here.
    #[derive(Deserialize)]
    struct Hit {
        score: f64,
        payload: HashMap<String, serde_json::Value>,
    }

    #[derive(Deserialize)]
    struct SearchEnvelope {
        result: Vec<Hit>,
    }

    let endpoint = format!("{}/collections/{}/points/search", self.base_url, collection);

    // The exclusion clause is always present; the file restriction is optional.
    let mut conditions = serde_json::json!({
        "must_not": [
            {
                "key": exclude_payload_key,
                "match": { "value": exclude_payload_value }
            }
        ]
    });
    if let Some(file_uuid) = include_file_uuid {
        conditions["must"] = serde_json::json!([
            {
                "key": "file_uuid",
                "match": { "value": file_uuid }
            }
        ]);
    }

    let request_body = serde_json::json!({
        "vector": query_vector,
        "limit": limit,
        "with_payload": true,
        "filter": conditions,
    });

    let reply = self
        .client
        .post(&endpoint)
        .header("api-key", &self.api_key)
        .header("Content-Type", "application/json")
        .json(&request_body)
        .send()
        .await
        .context("Failed to search Qdrant face collection")?;

    let http_status = reply.status();
    // The body is read before the status check so it can be quoted in the error.
    let raw_body = match reply.text().await {
        Ok(text) => text,
        Err(_) => "Failed to read response".to_string(),
    };

    if !http_status.is_success() {
        anyhow::bail!(
            "Qdrant search_face_collection failed: {} - {}",
            http_status,
            raw_body
        );
    }

    let envelope: SearchEnvelope = serde_json::from_str(&raw_body)
        .map_err(|e| anyhow::anyhow!("Failed to parse Qdrant response: {}", e))?;

    Ok(envelope
        .result
        .into_iter()
        .map(|hit| (hit.score, hit.payload))
        .collect())
}
pub async fn delete_by_uuid(&self, uuid: &str) -> Result<()> {
let url = format!(
"{}/collections/{}/points/delete",
@@ -563,7 +702,7 @@ impl QdrantDb {
"filter": {
"must": [
{
"key": "uuid",
"key": "file_uuid",
"match": {
"value": uuid
}
@@ -711,9 +850,11 @@ impl Database for QdrantDb {
impl VectorStore for QdrantDb {
async fn store_vector(&self, chunk_id: &str, vector: &[f32]) -> Result<()> {
let payload = VectorPayload {
uuid: String::new(),
file_uuid: String::new(),
chunk_id: chunk_id.to_string(),
chunk_type: String::new(),
start_frame: 0,
end_frame: 0,
start_time: 0.0,
end_time: 0.0,
text: None,
@@ -737,7 +878,9 @@ pub async fn sync_face_embeddings(file_uuid: &str) -> Result<()> {
let qdrant: QdrantDb = QdrantDb::new();
let query = format!(
"SELECT id, trace_id, frame_number, embedding FROM {} WHERE file_uuid = $1 AND embedding IS NOT NULL",
"SELECT id, trace_id, frame_number, embedding FROM {} \
WHERE file_uuid = $1 AND embedding IS NOT NULL \
AND ((metadata->>'qc_ok')::boolean IS NULL OR (metadata->>'qc_ok')::boolean = true)",
table
);
let rows = sqlx::query(&query).bind(file_uuid).fetch_all(&pool).await?;
@@ -767,3 +910,103 @@ pub async fn sync_face_embeddings(file_uuid: &str) -> Result<()> {
);
Ok(())
}
/// Build one averaged face embedding per trace of `file_uuid` and upsert the
/// results into the `<prefix>_traces` Qdrant collection.
///
/// Rows are read from the `face_detections` table, limited to detections that
/// have both an embedding and a trace id and whose `metadata->>'qc_ok'` is either
/// absent or true. The embeddings of each trace are averaged element-wise. Each
/// resulting point uses the trace id as its Qdrant point id and carries
/// `trace_id`, `file_uuid`, `frame_count` and `source = "trace"` as payload.
///
/// An embedding whose length differs from the first one seen for the same trace
/// is skipped with a warning instead of corrupting the average (or panicking).
/// `frame_count` is the number of embeddings that contributed to the average.
///
/// # Errors
/// Fails when the database connection or query fails, or when a Qdrant call
/// (collection setup or batch upsert) returns an error.
pub async fn sync_trace_embeddings(file_uuid: &str) -> Result<()> {
    use crate::core::config::DATABASE_URL;
    use sqlx::Row;
    use std::collections::HashMap;

    /// Running element-wise sum of the embeddings accepted for one trace.
    struct TraceAccumulator {
        sum: Vec<f32>,
        count: i64,
    }

    let pool = sqlx::PgPool::connect(&DATABASE_URL).await?;
    let table = crate::core::db::schema::table_name("face_detections");
    let qdrant = QdrantDb::new();
    let collection = format!(
        "{}_traces",
        crate::core::config::REDIS_KEY_PREFIX
            .as_str()
            .trim_end_matches(':')
    );
    qdrant.ensure_collection(&collection, 512).await?;

    // Read all face_detections with embeddings; grouping by trace_id happens in Rust.
    let rows = sqlx::query(&format!(
        "SELECT trace_id, embedding FROM {} \
         WHERE file_uuid = $1 AND embedding IS NOT NULL AND trace_id IS NOT NULL \
         AND ((metadata->>'qc_ok')::boolean IS NULL OR (metadata->>'qc_ok')::boolean = true)",
        table
    ))
    .bind(file_uuid)
    .fetch_all(&pool)
    .await?;

    // Accumulate sums on the fly so individual embeddings need not be retained.
    let mut traces: HashMap<i32, TraceAccumulator> = HashMap::new();
    for row in &rows {
        let tid: Option<i32> = row.get(0);
        let emb: Option<Vec<f32>> = row.get(1);
        if let (Some(tid), Some(emb)) = (tid, emb) {
            // The first embedding of a trace fixes that trace's dimensionality.
            let acc = traces.entry(tid).or_insert_with(|| TraceAccumulator {
                sum: vec![0.0f32; emb.len()],
                count: 0,
            });
            if acc.sum.len() != emb.len() {
                tracing::warn!(
                    "Skipping embedding with mismatched length for trace {} of {}: expected {}, got {}",
                    tid,
                    file_uuid,
                    acc.sum.len(),
                    emb.len()
                );
                continue;
            }
            for (total, value) in acc.sum.iter_mut().zip(&emb) {
                *total += *value;
            }
            acc.count += 1;
        }
    }

    // Turn each running sum into a mean. `count` is at least 1 for every entry,
    // because the embedding that creates an entry always matches its own length.
    let trace_avgs: Vec<(i32, Vec<f32>, i64)> = traces
        .into_iter()
        .map(|(tid, TraceAccumulator { sum, count })| {
            let n = count as f32;
            let avg: Vec<f32> = sum.into_iter().map(|v| v / n).collect();
            (tid, avg, count)
        })
        .collect();
    let trace_count = trace_avgs.len();

    // Push to Qdrant in batches.
    // NOTE(review): the point id is the bare trace id. If trace ids are only unique
    // within a file, points from different files would overwrite each other in this
    // shared collection — confirm against how trace ids are assigned. A negative
    // trace id would also wrap to a very large u64 through the `as` cast.
    for chunk in trace_avgs.chunks(500) {
        let batch: Vec<(u64, &[f32], Option<serde_json::Value>)> = chunk
            .iter()
            .map(|(tid, avg_emb, frame_count)| {
                (
                    *tid as u64,
                    avg_emb.as_slice(),
                    Some(serde_json::json!({
                        "trace_id": tid,
                        "file_uuid": file_uuid,
                        "frame_count": frame_count,
                        "source": "trace",
                    })),
                )
            })
            .collect();
        qdrant.upsert_vectors_batch(&collection, &batch).await?;
    }

    tracing::info!(
        "Synced {} trace embeddings to Qdrant for {}",
        trace_count,
        file_uuid
    );
    Ok(())
}

View File

@@ -45,9 +45,11 @@ impl SyncDb {
}
let payload = VectorPayload {
uuid: uuid.clone(),
file_uuid: uuid.clone(),
chunk_id: chunk_id.clone(),
chunk_type,
start_frame: chunk.start_frame,
end_frame: chunk.end_frame,
start_time,
end_time,
text: Some(text.to_string()),

View File

@@ -33,26 +33,38 @@ pub async fn run_consistency_checks(db: &PostgresDb) -> ConsistencyReport {
// Check 1: stale_processing — status=processing but job_id is null
let c1 = check_stale_processing(db).await;
if c1.count > 0 { any_issue = true; }
if c1.count > 0 {
any_issue = true;
}
checks.push(c1);
// Check 2: orphaned_processing — status=processing but no active monitor_job
let c2 = check_orphaned_processing(db).await;
if c2.count > 0 { any_issue = true; }
if c2.count > 0 {
any_issue = true;
}
checks.push(c2);
// Check 3: unregistered_with_uuid — DB rows left behind by migration
let c3 = check_unregistered_with_uuid(db).await;
if c3.count > 0 { any_issue = true; }
if c3.count > 0 {
any_issue = true;
}
checks.push(c3);
// Check 4: processing_job_done — status=processing but job already completed
let c4 = check_processing_job_done(db).await;
if c4.count > 0 { any_issue = true; }
if c4.count > 0 {
any_issue = true;
}
checks.push(c4);
ConsistencyReport {
status: if any_issue { "degraded".to_string() } else { "ok".to_string() },
status: if any_issue {
"degraded".to_string()
} else {
"ok".to_string()
},
checked_at,
checks,
}
@@ -68,9 +80,17 @@ async fn check_stale_processing(db: &PostgresDb) -> ConsistencyCheck {
.await
.unwrap_or_default();
let files: Vec<ConsistencyFile> = rows.into_iter().map(|(file_uuid, file_name, status): (String, String, String)| ConsistencyFile {
file_uuid, file_name, status, detail: "job_id is null".to_string(),
}).collect();
let files: Vec<ConsistencyFile> = rows
.into_iter()
.map(
|(file_uuid, file_name, status): (String, String, String)| ConsistencyFile {
file_uuid,
file_name,
status,
detail: "job_id is null".to_string(),
},
)
.collect();
ConsistencyCheck {
check: "stale_processing".to_string(),
@@ -83,19 +103,28 @@ async fn check_stale_processing(db: &PostgresDb) -> ConsistencyCheck {
async fn check_orphaned_processing(db: &PostgresDb) -> ConsistencyCheck {
let vt = schema::table_name("videos");
let mj = schema::table_name("monitor_jobs");
let rows: Vec<(String, String, String)> = sqlx::query_as::<_, (String, String, String)>(&format!(
"SELECT v.file_uuid, v.file_name, v.status \
let rows: Vec<(String, String, String)> =
sqlx::query_as::<_, (String, String, String)>(&format!(
"SELECT v.file_uuid, v.file_name, v.status \
FROM {} v LEFT JOIN {} m ON v.file_uuid = m.uuid AND m.status IN ('pending','running') \
WHERE v.status = 'processing' AND m.id IS NULL",
vt, mj
))
.fetch_all(db.pool())
.await
.unwrap_or_default();
vt, mj
))
.fetch_all(db.pool())
.await
.unwrap_or_default();
let files: Vec<ConsistencyFile> = rows.into_iter().map(|(file_uuid, file_name, status): (String, String, String)| ConsistencyFile {
file_uuid, file_name, status, detail: "no active monitor_job".to_string(),
}).collect();
let files: Vec<ConsistencyFile> = rows
.into_iter()
.map(
|(file_uuid, file_name, status): (String, String, String)| ConsistencyFile {
file_uuid,
file_name,
status,
detail: "no active monitor_job".to_string(),
},
)
.collect();
ConsistencyCheck {
check: "orphaned_processing".to_string(),
@@ -107,17 +136,26 @@ async fn check_orphaned_processing(db: &PostgresDb) -> ConsistencyCheck {
async fn check_unregistered_with_uuid(db: &PostgresDb) -> ConsistencyCheck {
let vt = schema::table_name("videos");
let rows: Vec<(String, String, String)> = sqlx::query_as::<_, (String, String, String)>(&format!(
"SELECT file_uuid, file_name, status FROM {} WHERE status = 'unregistered'",
vt
))
.fetch_all(db.pool())
.await
.unwrap_or_default();
let rows: Vec<(String, String, String)> =
sqlx::query_as::<_, (String, String, String)>(&format!(
"SELECT file_uuid, file_name, status FROM {} WHERE status = 'unregistered'",
vt
))
.fetch_all(db.pool())
.await
.unwrap_or_default();
let files: Vec<ConsistencyFile> = rows.into_iter().map(|(file_uuid, file_name, status): (String, String, String)| ConsistencyFile {
file_uuid, file_name, status, detail: "migration residue".to_string(),
}).collect();
let files: Vec<ConsistencyFile> = rows
.into_iter()
.map(
|(file_uuid, file_name, status): (String, String, String)| ConsistencyFile {
file_uuid,
file_name,
status,
detail: "migration residue".to_string(),
},
)
.collect();
ConsistencyCheck {
check: "unregistered_with_uuid".to_string(),
@@ -130,19 +168,28 @@ async fn check_unregistered_with_uuid(db: &PostgresDb) -> ConsistencyCheck {
async fn check_processing_job_done(db: &PostgresDb) -> ConsistencyCheck {
let vt = schema::table_name("videos");
let mj = schema::table_name("monitor_jobs");
let rows: Vec<(String, String, String)> = sqlx::query_as::<_, (String, String, String)>(&format!(
"SELECT v.file_uuid, v.file_name, v.status \
let rows: Vec<(String, String, String)> =
sqlx::query_as::<_, (String, String, String)>(&format!(
"SELECT v.file_uuid, v.file_name, v.status \
FROM {} v JOIN {} m ON v.file_uuid = m.uuid \
WHERE v.status = 'processing' AND m.status = 'completed'",
vt, mj
))
.fetch_all(db.pool())
.await
.unwrap_or_default();
vt, mj
))
.fetch_all(db.pool())
.await
.unwrap_or_default();
let files: Vec<ConsistencyFile> = rows.into_iter().map(|(file_uuid, file_name, status): (String, String, String)| ConsistencyFile {
file_uuid, file_name, status, detail: "monitor_job already completed".to_string(),
}).collect();
let files: Vec<ConsistencyFile> = rows
.into_iter()
.map(
|(file_uuid, file_name, status): (String, String, String)| ConsistencyFile {
file_uuid,
file_name,
status,
detail: "monitor_job already completed".to_string(),
},
)
.collect();
ConsistencyCheck {
check: "processing_job_done".to_string(),

View File

@@ -54,8 +54,7 @@ pub fn read_identity_file(uuid: &str) -> Result<IdentityFile> {
let path = identity_file_path(uuid);
let content = std::fs::read_to_string(&path)
.with_context(|| format!("Identity file not found: {} ({})", uuid, path.display()))?;
serde_json::from_str(&content)
.with_context(|| format!("Invalid identity.json: {}", uuid))
serde_json::from_str(&content).with_context(|| format!("Invalid identity.json: {}", uuid))
}
pub fn write_identity_file(file: &IdentityFile) -> Result<()> {
@@ -167,7 +166,10 @@ pub fn rebuild_index() -> Result<usize> {
entries.insert(uuid.clone(), file.name);
}
Err(e) => {
warn!("[identity-storage] Skipping {} in index rebuild: {}", uuid, e);
warn!(
"[identity-storage] Skipping {} in index rebuild: {}",
uuid, e
);
}
}
}
@@ -187,18 +189,16 @@ pub async fn save_identity_file_by_pool(pool: &sqlx::PgPool, uuid: &str) -> Resu
let identity_table = crate::core::db::schema::table_name("identities");
let fd_table = crate::core::db::schema::table_name("face_detections");
// Schema-aware column selection: dev uses 'name', public uses 'real_name'
let name_col = if identity_table.starts_with("dev.") { "name" } else { "real_name" };
let clean = uuid.replace('-', "");
let record = sqlx::query_as::<_, crate::core::db::IdentityDetailRecord>(
&format!(
"SELECT id, uuid::text, {} AS name, identity_type, source, status, metadata, reference_data, \
NULL::real[] as voice_embedding, NULL::real[] as identity_embedding, \
face_embedding::real[] as face_embedding, \
tmdb_id, tmdb_profile, created_at::timestamptz as created_at, NULL::timestamptz as updated_at \
FROM {} WHERE REPLACE(uuid::text, '-', '') = $1",
name_col, identity_table
"SELECT id, uuid::text, name, identity_type, source, status, metadata, reference_data, \
NULL::real[] as voice_embedding, NULL::real[] as identity_embedding, \
face_embedding::real[] as face_embedding, \
tmdb_id, tmdb_profile, created_at::timestamptz as created_at, NULL::timestamptz as updated_at \
FROM {} WHERE REPLACE(uuid::text, '-', '') = $1",
identity_table
)
)
.bind(&clean)
@@ -322,8 +322,13 @@ pub fn update_index_at(base: &std::path::Path, uuid: &str, name: &str) -> Result
let mut entries: HashMap<String, String> = if index_path.exists() {
let content = std::fs::read_to_string(&index_path)?;
let v: serde_json::Value = serde_json::from_str(&content).unwrap_or_default();
v["entries"].as_object()
.map(|obj| obj.iter().map(|(k, v)| (k.clone(), v.as_str().unwrap_or("").to_string())).collect())
v["entries"]
.as_object()
.map(|obj| {
obj.iter()
.map(|(k, v)| (k.clone(), v.as_str().unwrap_or("").to_string()))
.collect()
})
.unwrap_or_default()
} else {
HashMap::new()
@@ -338,7 +343,9 @@ pub fn update_index_at(base: &std::path::Path, uuid: &str, name: &str) -> Result
}
pub async fn save_identity_file(db: &PostgresDb, uuid: &str) -> Result<()> {
let record = db.get_identity_by_uuid(uuid).await?
let record = db
.get_identity_by_uuid(uuid)
.await?
.with_context(|| format!("Identity not found in DB: {}", uuid))?;
let identity_uuid = record.uuid.clone();
@@ -415,6 +422,7 @@ mod tests {
status: Some("confirmed".to_string()),
tmdb_id: Some(112),
tmdb_profile: Some("https://image.tmdb.org/t/p/w185/test.jpg".to_string()),
local_profile: None,
metadata: serde_json::json!({"tmdb_character": "Test Role"}),
file_bindings: vec![FileBinding {
file_uuid: "ffffffffffffffffffffffffffffffff".to_string(),
@@ -442,7 +450,9 @@ mod tests {
fn test_identity_dir_path() {
let uuid = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
let p = identity_dir(uuid);
assert!(p.to_string_lossy().ends_with(&format!("identities/{}", uuid)));
assert!(p
.to_string_lossy()
.ends_with(&format!("identities/{}", uuid)));
}
#[test]
@@ -463,7 +473,10 @@ mod tests {
let base = Path::new("/tmp/test_base");
let uuid = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb";
let p = identity_dir_at(base, uuid);
assert_eq!(p, Path::new("/tmp/test_base/identities/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"));
assert_eq!(
p,
Path::new("/tmp/test_base/identities/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb")
);
}
#[test]
@@ -490,7 +503,10 @@ mod tests {
assert_eq!(read.name, file.name);
assert_eq!(read.source, file.source);
assert_eq!(read.tmdb_id, file.tmdb_id);
assert_eq!(read.file_bindings[0].face_count, file.file_bindings[0].face_count);
assert_eq!(
read.file_bindings[0].face_count,
file.file_bindings[0].face_count
);
let _ = std::fs::remove_dir_all(&tmp);
}
@@ -521,9 +537,21 @@ mod tests {
let _ = std::fs::remove_dir_all(&tmp);
let base = &tmp;
std::fs::create_dir_all(base.join("identities").join("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")).unwrap();
std::fs::create_dir_all(base.join("identities").join("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb")).unwrap();
std::fs::create_dir_all(base.join("identities").join("cccccccccccccccccccccccccccccccc")).unwrap();
std::fs::create_dir_all(
base.join("identities")
.join("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
)
.unwrap();
std::fs::create_dir_all(
base.join("identities")
.join("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"),
)
.unwrap();
std::fs::create_dir_all(
base.join("identities")
.join("cccccccccccccccccccccccccccccccc"),
)
.unwrap();
std::fs::create_dir_all(base.join("identities").join("not_a_uuid")).unwrap();
std::fs::create_dir_all(base.join("identities").join("short")).unwrap();

View File

@@ -56,19 +56,25 @@ impl IngestionService {
.to_string();
// 1. Compute SHA256 for dedup
let content_hash = crate::core::storage::content_hash::compute_sha256(&canonical_path).ok().unwrap_or_default();
let content_hash = crate::core::storage::content_hash::compute_sha256(&canonical_path)
.ok()
.unwrap_or_default();
// 2. Hash check — same content = already registered
let videos_table = schema::table_name("videos");
if !content_hash.is_empty() {
if let Ok(Some(existing_uuid)) = sqlx::query_scalar::<_, String>(
&format!("SELECT file_uuid FROM {} WHERE content_hash = $1 LIMIT 1", videos_table)
)
if let Ok(Some(existing_uuid)) = sqlx::query_scalar::<_, String>(&format!(
"SELECT file_uuid FROM {} WHERE content_hash = $1 LIMIT 1",
videos_table
))
.bind(&content_hash)
.fetch_optional(self.db.pool())
.await
{
info!("Content already registered: {} ({})", filename, existing_uuid);
info!(
"Content already registered: {} ({})",
filename, existing_uuid
);
return Ok(Some(existing_uuid));
}
}
@@ -108,7 +114,8 @@ impl IngestionService {
let probe_result = probe::probe_video(file_path).ok();
let file_meta = std::fs::metadata(&canonical_path).ok();
let duration = probe_result.as_ref()
let duration = probe_result
.as_ref()
.and_then(|r| r.format.duration.as_ref())
.and_then(|s| s.parse::<f64>().ok())
.unwrap_or(0.0);
@@ -148,7 +155,11 @@ impl IngestionService {
}
let total_frames = {
let video_stream = probe_result.as_ref().and_then(|pr| pr.streams.iter().find(|s| s.codec_type.as_deref() == Some("video")));
let video_stream = probe_result.as_ref().and_then(|pr| {
pr.streams
.iter()
.find(|s| s.codec_type.as_deref() == Some("video"))
});
if let Some(stream) = video_stream {
if let Some(nb_frames_str) = &stream.nb_frames {
@@ -223,11 +234,14 @@ impl IngestionService {
// Store content_hash for dedup
if !content_hash.is_empty() {
let vt = schema::table_name("videos");
let _ = sqlx::query(&format!("UPDATE {} SET content_hash = $1 WHERE file_uuid = $2", vt))
.bind(&content_hash)
.bind(&uuid)
.execute(self.db.pool())
.await;
let _ = sqlx::query(&format!(
"UPDATE {} SET content_hash = $1 WHERE file_uuid = $2",
vt
))
.bind(&content_hash)
.bind(&uuid)
.execute(self.db.pool())
.await;
}
self.db
@@ -243,5 +257,3 @@ impl IngestionService {
Ok(Some(uuid))
}
}

View File

@@ -17,42 +17,84 @@ mod tests {
#[test]
fn test_detect_category_image() {
assert_eq!(detect_category(Path::new("photo.jpg")), FileCategory::Image);
assert_eq!(detect_category(Path::new("photo.jpeg")), FileCategory::Image);
assert_eq!(
detect_category(Path::new("photo.jpeg")),
FileCategory::Image
);
assert_eq!(detect_category(Path::new("photo.png")), FileCategory::Image);
assert_eq!(detect_category(Path::new("photo.svg")), FileCategory::Image);
assert_eq!(detect_category(Path::new("photo.webp")), FileCategory::Image);
assert_eq!(
detect_category(Path::new("photo.webp")),
FileCategory::Image
);
}
#[test]
fn test_detect_category_document() {
assert_eq!(detect_category(Path::new("doc.pdf")), FileCategory::Document);
assert_eq!(detect_category(Path::new("doc.docx")), FileCategory::Document);
assert_eq!(detect_category(Path::new("doc.pages")), FileCategory::Document);
assert_eq!(detect_category(Path::new("doc.txt")), FileCategory::Document);
assert_eq!(
detect_category(Path::new("doc.pdf")),
FileCategory::Document
);
assert_eq!(
detect_category(Path::new("doc.docx")),
FileCategory::Document
);
assert_eq!(
detect_category(Path::new("doc.pages")),
FileCategory::Document
);
assert_eq!(
detect_category(Path::new("doc.txt")),
FileCategory::Document
);
}
#[test]
fn test_detect_category_spreadsheet() {
assert_eq!(detect_category(Path::new("data.xlsx")), FileCategory::Spreadsheet);
assert_eq!(detect_category(Path::new("data.csv")), FileCategory::Spreadsheet);
assert_eq!(detect_category(Path::new("data.numbers")), FileCategory::Spreadsheet);
assert_eq!(
detect_category(Path::new("data.xlsx")),
FileCategory::Spreadsheet
);
assert_eq!(
detect_category(Path::new("data.csv")),
FileCategory::Spreadsheet
);
assert_eq!(
detect_category(Path::new("data.numbers")),
FileCategory::Spreadsheet
);
}
#[test]
fn test_detect_category_presentation() {
assert_eq!(detect_category(Path::new("deck.pptx")), FileCategory::Presentation);
assert_eq!(detect_category(Path::new("deck.key")), FileCategory::Presentation);
assert_eq!(
detect_category(Path::new("deck.pptx")),
FileCategory::Presentation
);
assert_eq!(
detect_category(Path::new("deck.key")),
FileCategory::Presentation
);
}
#[test]
fn test_detect_category_archive() {
assert_eq!(detect_category(Path::new("files.zip")), FileCategory::Archive);
assert_eq!(detect_category(Path::new("files.tar.gz")), FileCategory::Archive);
assert_eq!(
detect_category(Path::new("files.zip")),
FileCategory::Archive
);
assert_eq!(
detect_category(Path::new("files.tar.gz")),
FileCategory::Archive
);
}
#[test]
fn test_detect_category_unknown() {
assert_eq!(detect_category(Path::new("file.xyz")), FileCategory::Unknown);
assert_eq!(
detect_category(Path::new("file.xyz")),
FileCategory::Unknown
);
assert_eq!(detect_category(Path::new("file")), FileCategory::Unknown);
}
@@ -84,13 +126,18 @@ pub enum FileCategory {
/// Detect file category from path extension
pub fn detect_category(path: &Path) -> FileCategory {
let ext = path.extension()
let ext = path
.extension()
.and_then(|e| e.to_str())
.map(|e| e.to_lowercase());
match ext.as_deref() {
Some("mp4" | "mov" | "mkv" | "avi" | "webm" | "m4v" | "mpeg") => FileCategory::Video,
Some("jpg" | "jpeg" | "png" | "gif" | "bmp" | "webp" | "svg" | "heic" | "tiff") => FileCategory::Image,
Some("pdf" | "doc" | "docx" | "odt" | "pages" | "rtf" | "txt" | "md" | "rst") => FileCategory::Document,
Some("jpg" | "jpeg" | "png" | "gif" | "bmp" | "webp" | "svg" | "heic" | "tiff") => {
FileCategory::Image
}
Some("pdf" | "doc" | "docx" | "odt" | "pages" | "rtf" | "txt" | "md" | "rst") => {
FileCategory::Document
}
Some("xls" | "xlsx" | "csv" | "ods" | "numbers") => FileCategory::Spreadsheet,
Some("ppt" | "pptx" | "odp" | "key") => FileCategory::Presentation,
Some("zip" | "tar" | "gz" | "tgz" | "7z" | "rar") => FileCategory::Archive,
@@ -102,16 +149,20 @@ pub fn detect_category(path: &Path) -> FileCategory {
pub fn base_format_info(path: &Path) -> serde_json::Value {
let meta = std::fs::metadata(path).ok();
let size = meta.as_ref().map(|m| m.len()).unwrap_or(0);
let mtime = meta.as_ref()
let mtime = meta
.as_ref()
.and_then(|m| m.modified().ok())
.and_then(|t| {
let secs = t.duration_since(SystemTime::UNIX_EPOCH).ok()?.as_secs() as i64;
chrono::DateTime::from_timestamp(secs, 0)
.map(|dt| dt.to_rfc3339())
chrono::DateTime::from_timestamp(secs, 0).map(|dt| dt.to_rfc3339())
})
.unwrap_or_default();
let fname = path.to_string_lossy().to_string();
let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("").to_lowercase();
let ext = path
.extension()
.and_then(|e| e.to_str())
.unwrap_or("")
.to_lowercase();
let cat = detect_category(path);
let file_type = match cat {
FileCategory::Video => "video",
@@ -150,7 +201,13 @@ fn ffprobe_probe(path: &Path, format_base: serde_json::Value) -> serde_json::Val
}
/// Run Python probe for document/spreadsheet/presentation files
fn python_probe(path: &Path, category: &FileCategory, scripts_dir: &str, python_path: &str, format_base: serde_json::Value) -> serde_json::Value {
fn python_probe(
path: &Path,
category: &FileCategory,
scripts_dir: &str,
python_path: &str,
format_base: serde_json::Value,
) -> serde_json::Value {
let script = format!("{}/probe_file.py", scripts_dir);
if !std::path::Path::new(&script).exists() {
return minimal_probe(format_base);
@@ -184,18 +241,12 @@ fn minimal_probe(format_base: serde_json::Value) -> serde_json::Value {
/// Unified probe: dispatches to the right probe based on file type
/// Returns a probe_json-compatible Value
pub async fn unified_probe(
path: &Path,
scripts_dir: &str,
python_path: &str,
) -> serde_json::Value {
pub async fn unified_probe(path: &Path, scripts_dir: &str, python_path: &str) -> serde_json::Value {
let cat = detect_category(path);
let format_base = base_format_info(path);
match cat {
FileCategory::Video | FileCategory::Image => {
ffprobe_probe(path, format_base)
}
FileCategory::Video | FileCategory::Image => ffprobe_probe(path, format_base),
FileCategory::Document | FileCategory::Spreadsheet | FileCategory::Presentation => {
python_probe(path, &cat, scripts_dir, python_path, format_base)
}

View File

@@ -1,5 +1,6 @@
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::process::Command;
use std::time::Duration;
use super::executor::PythonExecutor;
@@ -27,13 +28,21 @@ pub async fn process_cut(
output_path: &str,
uuid: Option<&str>,
) -> Result<CutResult> {
// Try native ffmpeg-based scene detection first
let result = try_native_cut(video_path);
if let Ok(r) = result {
let json = serde_json::to_string_pretty(&r)?;
std::fs::write(output_path, &json)
.with_context(|| format!("Failed to write {:?}", output_path))?;
return Ok(r);
}
// Fallback: Python scenedetect
tracing::warn!("[CUT] Native impl failed, falling back to Python");
let executor = PythonExecutor::new()?;
let script_path = executor.script_path("cut_processor.py");
tracing::info!("[CUT] Starting scene detection: {}", video_path);
if !script_path.exists() {
tracing::warn!("[CUT] Script not found, returning empty result");
return Ok(CutResult {
frame_count: 0,
fps: 0.0,
@@ -53,19 +62,179 @@ pub async fn process_cut(
.with_context(|| format!("Failed to run {:?}", script_path))?;
let json_str = std::fs::read_to_string(output_path).context("Failed to read CUT output")?;
let result: CutResult =
serde_json::from_str(&json_str).context("Failed to parse CUT output")?;
tracing::info!("[CUT] Result: {} scenes detected", result.scenes.len());
Ok(result)
}
// ── Native ffmpeg scene detection ─────────────────────────────────
/// Extract scene-cut timestamps (seconds) from the scene-detection `ffprobe` run.
///
/// The probe is started with `-show_entries frame=pts_time -of compact=p=0:nk=1`,
/// so every selected frame is reported on **stdout** as a bare number. Because the
/// probe also runs with `-v quiet`, the `showinfo` log lines never reach stderr;
/// stderr is only consulted as a fallback (looking for `pts_time:<value>` tokens)
/// when stdout produced nothing.
fn parse_scene_cut_times(stdout: &str, stderr: &str) -> Vec<f64> {
    fn is_usable(t: &f64) -> bool {
        t.is_finite() && *t >= 0.0
    }

    let from_stdout: Vec<f64> = stdout
        .lines()
        // Tolerate trailing `|`-separated fields: only the first one is the timestamp.
        .filter_map(|line| line.split('|').next())
        .filter_map(|field| field.trim().parse::<f64>().ok())
        .filter(is_usable)
        .collect();
    if !from_stdout.is_empty() {
        return from_stdout;
    }

    stderr
        .lines()
        .filter_map(|line| line.split_once("pts_time:"))
        .filter_map(|(_, rest)| rest.split_whitespace().next())
        .filter_map(|token| token.parse::<f64>().ok())
        .filter(is_usable)
        .collect()
}

/// Detect scene cuts with `ffprobe` and convert them into a [`CutResult`].
///
/// Two `ffprobe` runs are made: one to read the stream/format metadata (frame
/// rate, frame count, duration) and one that evaluates the lavfi graph
/// `movie=<path>,select='gt(scene,0.3)',showinfo` to list the timestamps of
/// frames whose scene score exceeds 0.3. Consecutive cut points are turned into
/// scenes; when no cut is found a single scene spanning the whole video is
/// emitted.
///
/// # Errors
/// Fails when either `ffprobe` process cannot be spawned or when the metadata
/// JSON cannot be parsed.
fn try_native_cut(video_path: &str) -> Result<CutResult> {
    // Step 1: Get video info (fps, frame count, duration)
    let probe = Command::new("ffprobe")
        .args([
            "-v",
            "quiet",
            "-print_format",
            "json",
            "-show_format",
            "-show_streams",
            video_path,
        ])
        .output()
        .context("Failed to run ffprobe")?;

    let probe_info: serde_json::Value =
        serde_json::from_slice(&probe.stdout).context("Failed to parse ffprobe output")?;

    let video_stream = probe_info["streams"]
        .as_array()
        .and_then(|streams| streams.iter().find(|s| s["codec_type"] == "video"));

    // A rate such as "0/1" parses to 0.0; reject it so the divisions below stay finite.
    let fps = video_stream
        .and_then(|s| s["r_frame_rate"].as_str().and_then(parse_fraction))
        .filter(|rate| rate.is_finite() && *rate > 0.0)
        .unwrap_or(30.0);

    let duration: f64 = probe_info["format"]["duration"]
        .as_str()
        .and_then(|s| s.parse().ok())
        .unwrap_or(0.0);

    let reported_frames: u64 = video_stream
        .and_then(|s| s["nb_frames"].as_str())
        .and_then(|s| s.parse().ok())
        .unwrap_or(0);

    // Some containers do not report `nb_frames`; estimate it from the duration so
    // the final/whole-video scene can still be produced.
    let total_frames = if reported_frames > 0 {
        reported_frames
    } else if duration > 0.0 {
        (duration * fps).round() as u64
    } else {
        0
    };

    // Step 2: Use the ffmpeg `scene` score through a lavfi graph.
    // The `select` filter keeps only frames whose difference to the previous frame
    // exceeds the threshold (0.3 = medium sensitivity).
    // NOTE(review): `video_path` is interpolated into the filter graph unescaped;
    // paths containing lavfi-special characters (`:`, `,`, `'`, `\`) will likely
    // break the graph — confirm callers only pass "plain" paths.
    let filter_graph = format!("movie={},select='gt(scene\\,0.3)',showinfo", video_path);
    let scene_output = Command::new("ffprobe")
        .args([
            "-v",
            "quiet",
            "-show_entries",
            "frame=pts_time",
            "-of",
            "compact=p=0:nk=1",
            "-f",
            "lavfi",
            filter_graph.as_str(),
            "-show_frames",
        ])
        .output()
        .context("Failed to run ffmpeg scene detection")?;

    let scene_times = parse_scene_cut_times(
        &String::from_utf8_lossy(&scene_output.stdout),
        &String::from_utf8_lossy(&scene_output.stderr),
    );

    // Step 3: Build scenes from cut points
    let mut scenes: Vec<CutScene> = Vec::with_capacity(scene_times.len() + 1);
    let mut prev_time = 0.0;
    let mut prev_frame: u64 = 0;

    for &cut_time in &scene_times {
        let cut_frame = (cut_time * fps).round() as u64;

        if cut_frame > prev_frame {
            // Number by emitted scenes (not by cut index) so skipped degenerate cuts
            // cannot leave gaps or collide with the final scene's number.
            let scene_number = (scenes.len() + 1) as u32;
            scenes.push(CutScene {
                scene_number,
                start_frame: prev_frame,
                end_frame: cut_frame.saturating_sub(1),
                start_time: prev_time,
                end_time: cut_time - (1.0 / fps),
            });
        }

        prev_time = cut_time;
        prev_frame = cut_frame;
    }

    // Final scene (last cut point → end of video)
    if total_frames > 0 && prev_frame < total_frames {
        let scene_number = (scenes.len() + 1) as u32;
        scenes.push(CutScene {
            scene_number,
            start_frame: prev_frame,
            end_frame: total_frames.saturating_sub(1),
            start_time: prev_time,
            end_time: duration,
        });
    }

    // If no scenes detected, create a single scene covering the whole video
    if scenes.is_empty() && total_frames > 0 {
        scenes.push(CutScene {
            scene_number: 1,
            start_frame: 0,
            end_frame: total_frames.saturating_sub(1),
            start_time: 0.0,
            end_time: duration,
        });
    }

    Ok(CutResult {
        frame_count: total_frames,
        fps,
        scenes,
    })
}
/// Convert an ffprobe-style frame rate into a floating-point value.
///
/// Accepts either a rational such as `"30000/1001"` or a plain number such as
/// `"30"`. Returns `None` when a component is not numeric or when the
/// denominator is not strictly positive.
fn parse_fraction(s: &str) -> Option<f64> {
    match s.split_once('/') {
        Some((numerator, denominator)) => {
            let numerator: f64 = numerator.parse().ok()?;
            let denominator: f64 = denominator.parse().ok()?;
            if denominator > 0.0 {
                Some(numerator / denominator)
            } else {
                // Same fall-through as the plain-number path; a string that
                // contains '/' never parses as a float, so this yields `None`.
                s.parse::<f64>().ok()
            }
        }
        None => s.parse::<f64>().ok(),
    }
}
// ── Tests ─────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
use super::*;
/// A rational NTSC rate must evaluate to roughly 29.97 fps.
#[test]
fn test_parse_fraction() {
    let fps = parse_fraction("30000/1001").unwrap();
    let deviation = (fps - 29.97).abs();
    assert!(deviation < 0.01);
}
/// A plain integer string is accepted and parsed as-is.
#[test]
fn test_parse_fraction_int() {
    let fps = parse_fraction("30").unwrap();
    let deviation = (fps - 30.0).abs();
    assert!(deviation < 0.01);
}
/// Non-numeric components are rejected.
#[test]
fn test_parse_fraction_invalid() {
    let parsed = parse_fraction("not/a/num");
    assert!(parsed.is_none());
}
/// A zero denominator is rejected rather than producing infinity.
#[test]
fn test_parse_fraction_zero_den() {
    let parsed = parse_fraction("1/0");
    assert!(parsed.is_none());
}
#[test]
fn test_cut_result_serialization() {
let result = CutResult {
@@ -81,8 +250,9 @@ mod tests {
};
let json = serde_json::to_string(&result).unwrap();
assert!(json.contains("frame_count"));
assert!(json.contains("scene_number"));
assert!(json.contains("1"));
assert!(json.contains("fps"));
}
#[test]
@@ -90,20 +260,23 @@ mod tests {
let json = r#"{
"frame_count": 100,
"fps": 30.0,
"scenes": [
{"scene_number": 1, "start_frame": 0, "end_frame": 30, "start_time": 0.0, "end_time": 1.0},
{"scene_number": 2, "start_frame": 31, "end_frame": 60, "start_time": 1.033, "end_time": 2.0}
]
"scenes": [{
"scene_number": 1,
"start_frame": 0,
"end_frame": 30,
"start_time": 0.0,
"end_time": 1.0
}]
}"#;
let result: CutResult = serde_json::from_str(json).unwrap();
assert_eq!(result.frame_count, 100);
assert_eq!(result.scenes.len(), 2);
assert_eq!(result.scenes[1].scene_number, 2);
assert_eq!(result.scenes.len(), 1);
assert_eq!(result.scenes[0].scene_number, 1);
assert_eq!(result.scenes[0].start_frame, 0);
}
#[test]
fn test_cut_result_empty_scenes() {
fn test_empty_scenes() {
let result = CutResult {
frame_count: 0,
fps: 0.0,
@@ -111,17 +284,4 @@ mod tests {
};
assert!(result.scenes.is_empty());
}
#[test]
fn test_cut_scene_times() {
let scene = CutScene {
scene_number: 1,
start_frame: 0,
end_frame: 30,
start_time: 0.0,
end_time: 1.0,
};
assert!(scene.end_time > scene.start_time);
assert_eq!(scene.scene_number, 1);
}
}

View File

@@ -109,11 +109,10 @@ pub fn validate_python_env() -> Result<()> {
tracing::warn!("Expected Python 3.11, got: {}", version.trim());
}
let scripts_dir = std::env::var("MOMENTRY_SCRIPTS_DIR")
.unwrap_or_else(|_| {
let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
manifest.join("scripts").to_string_lossy().to_string()
});
let scripts_dir = std::env::var("MOMENTRY_SCRIPTS_DIR").unwrap_or_else(|_| {
let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
manifest.join("scripts").to_string_lossy().to_string()
});
let script_path = PathBuf::from(&scripts_dir);
if !script_path.exists() {
anyhow::bail!("Scripts directory not found at {}", scripts_dir);
@@ -133,11 +132,10 @@ impl PythonExecutor {
pub fn new() -> Result<Self> {
let python_path = std::env::var("MOMENTRY_PYTHON_PATH")
.unwrap_or_else(|_| "/opt/homebrew/bin/python3.11".to_string());
let scripts_dir = std::env::var("MOMENTRY_SCRIPTS_DIR")
.unwrap_or_else(|_| {
let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
manifest.join("scripts").to_string_lossy().to_string()
});
let scripts_dir = std::env::var("MOMENTRY_SCRIPTS_DIR").unwrap_or_else(|_| {
let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
manifest.join("scripts").to_string_lossy().to_string()
});
let python_bin = PathBuf::from(&python_path);
let scripts_path = PathBuf::from(&scripts_dir);
@@ -173,7 +171,8 @@ impl PythonExecutor {
if let Some(expected_hash) = self.checksums.get(&rel_path) {
let output = std::process::Command::new("shasum")
.arg("-a").arg("256")
.arg("-a")
.arg("256")
.arg(&script_path)
.output()
.context("Failed to run shasum for integrity check")?;
@@ -235,8 +234,9 @@ impl PythonExecutor {
}
// Verify script integrity via SHA256 checksum before execution
self.verify_script_integrity(script_name)
.context("Pre-execution integrity check failed — possible version mismatch or corruption")?;
self.verify_script_integrity(script_name).context(
"Pre-execution integrity check failed — possible version mismatch or corruption",
)?;
// 標記輸出檔為處理中add .tmp suffix
let output_path = args.get(1).map(|p| std::path::PathBuf::from(p));

View File

@@ -44,22 +44,59 @@ pub enum CrowdSize {
/// Indoor-indicative YOLO classes (COCO labels)
const INDOOR_CLASSES: &[&str] = &[
"chair", "couch", "bed", "dining table", "toilet", "tv", "laptop",
"microwave", "oven", "refrigerator", "sink", "book", "clock",
"vase", "potted plant",
"chair",
"couch",
"bed",
"dining table",
"toilet",
"tv",
"laptop",
"microwave",
"oven",
"refrigerator",
"sink",
"book",
"clock",
"vase",
"potted plant",
];
/// Vehicle-indicative classes (person + vehicle = transport scene)
const VEHICLE_CLASSES: &[&str] = &[
"car", "truck", "bus", "train", "boat", "aeroplane", "bicycle", "motorbike",
"car",
"truck",
"bus",
"train",
"boat",
"aeroplane",
"bicycle",
"motorbike",
];
/// Outdoor-indicative YOLO classes
const OUTDOOR_CLASSES: &[&str] = &[
"car", "truck", "bus", "train", "boat", "airplane",
"traffic light", "fire hydrant", "stop sign", "parking meter",
"bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant",
"bear", "zebra", "giraffe", "tree",
"car",
"truck",
"bus",
"train",
"boat",
"airplane",
"traffic light",
"fire hydrant",
"stop sign",
"parking meter",
"bench",
"bird",
"cat",
"dog",
"horse",
"sheep",
"cow",
"elephant",
"bear",
"zebra",
"giraffe",
"tree",
];
/// Build heuristic scene metadata from disk files (yolo.json + DB face data).
@@ -113,13 +150,14 @@ pub async fn build_heuristic_scene_meta(
// Get face counts grouped by frame
let fd_table = schema::table_name("face_detections");
let face_rows: Vec<(i64, i64)> = sqlx::query_as(
&format!("SELECT frame_number, COUNT(*) as fc \
let face_rows: Vec<(i64, i64)> = sqlx::query_as(&format!(
"SELECT frame_number, COUNT(*) as fc \
FROM {} \
WHERE file_uuid = $1 AND frame_number IS NOT NULL \
GROUP BY frame_number \
ORDER BY frame_number", fd_table),
)
ORDER BY frame_number",
fd_table
))
.bind(file_uuid)
.fetch_all(pool)
.await
@@ -166,7 +204,10 @@ pub async fn build_heuristic_scene_meta(
let outdoor_ratio = outdoor_objects as f64 / frame_count.max(1) as f64;
let total_indicator = indoor_ratio + outdoor_ratio;
let (indoor_score, outdoor_score) = if total_indicator > 0.0 {
(indoor_ratio / total_indicator, outdoor_ratio / total_indicator)
(
indoor_ratio / total_indicator,
outdoor_ratio / total_indicator,
)
} else {
(0.5, 0.5)
};
@@ -187,17 +228,13 @@ pub async fn build_heuristic_scene_meta(
.map(|c| class_frame_presence.get(*c).copied().unwrap_or(0))
.sum();
let person_ratio = person_frames as f64 / frame_count.max(1) as f64;
let likely_vehicle = person_ratio > 0.5 && vehicle_frames > 0
&& outdoor_score > 0.3;
let likely_vehicle = person_ratio > 0.5 && vehicle_frames > 0 && outdoor_score > 0.3;
// Dominant objects: rank by frame presence (not total count)
let mut sorted: Vec<_> = class_frame_presence.into_iter().collect();
sorted.sort_by(|a, b| b.1.cmp(&a.1));
let dominant_objects: Vec<String> = sorted
.iter()
.take(3)
.map(|(cls, _)| cls.clone())
.collect();
let dominant_objects: Vec<String> =
sorted.iter().take(3).map(|(cls, _)| cls.clone()).collect();
segments.push(SceneSegmentMeta {
segment_index: idx as u32 + 1,
@@ -229,12 +266,15 @@ pub async fn build_heuristic_scene_meta(
/// Full pipeline entry point: reads CUT data, generates heuristic metadata, writes JSON.
/// Called from job_worker post-processing trigger.
pub async fn generate_scene_meta(db: &crate::core::db::PostgresDb, file_uuid: &str) -> Result<usize> {
pub async fn generate_scene_meta(
db: &crate::core::db::PostgresDb,
file_uuid: &str,
) -> Result<usize> {
let pool = db.pool();
// Read CUT segment boundaries from cut.json
let cut_path = Path::new(crate::core::config::OUTPUT_DIR.as_str())
.join(format!("{}.cut.json", file_uuid));
let cut_path =
Path::new(crate::core::config::OUTPUT_DIR.as_str()).join(format!("{}.cut.json", file_uuid));
let segments: Vec<(i64, i64, f64, f64)> = if cut_path.exists() {
let cut_str = tokio::fs::read_to_string(&cut_path)
.await
@@ -250,8 +290,7 @@ pub async fn generate_scene_meta(db: &crate::core::db::PostgresDb, file_uuid: &s
start_time: f64,
end_time: f64,
}
let cut: CutJson = serde_json::from_str(&cut_str)
.context("Failed to parse cut.json")?;
let cut: CutJson = serde_json::from_str(&cut_str).context("Failed to parse cut.json")?;
cut.scenes
.into_iter()
.map(|s| (s.start_frame, s.end_frame, s.start_time, s.end_time))
@@ -259,9 +298,10 @@ pub async fn generate_scene_meta(db: &crate::core::db::PostgresDb, file_uuid: &s
} else {
// Fallback: query DB for video duration, make one segment
let videos_table = schema::table_name("videos");
let (total_frames, duration): (Option<i64>, Option<f64>) = sqlx::query_as(
&format!("SELECT total_frames, duration FROM {} WHERE file_uuid = $1", videos_table),
)
let (total_frames, duration): (Option<i64>, Option<f64>) = sqlx::query_as(&format!(
"SELECT total_frames, duration FROM {} WHERE file_uuid = $1",
videos_table
))
.bind(file_uuid)
.fetch_optional(pool)
.await

View File

@@ -10,6 +10,7 @@ pub mod ocr;
pub mod pose;
pub mod scene_classification;
pub mod story;
pub mod tkg;
pub mod visual_chunk;
pub mod yolo;
@@ -25,7 +26,8 @@ pub use face_recognition::{
RecognizedFaceDetection,
};
pub use heuristic_scene::{
build_heuristic_scene_meta, generate_scene_meta, CrowdSize, HeuristicSceneMeta, SceneSegmentMeta,
build_heuristic_scene_meta, generate_scene_meta, CrowdSize, HeuristicSceneMeta,
SceneSegmentMeta,
};
pub use ocr::{process_ocr, OcrFrame, OcrResult, OcrText};
pub use pose::{process_pose, Bbox, Keypoint, PersonPose, PoseFrame, PoseResult};
@@ -34,5 +36,6 @@ pub use scene_classification::{
SceneSegment,
};
pub use story::{process_story, StoryChildChunk, StoryParentChunk, StoryResult, StoryStats};
pub use tkg::{build_tkg, TkgResult};
pub use visual_chunk::{process_visual_chunk, process_visual_chunk_advanced, VisualChunkResult};
pub use yolo::{process_yolo, YoloFrame, YoloObject, YoloResult};

View File

@@ -106,7 +106,10 @@ pub async fn process_story(
}
// Fallback: Python script
tracing::warn!("[STORY] Native impl failed, falling back to Python: {:?}", result.err());
tracing::warn!(
"[STORY] Native impl failed, falling back to Python: {:?}",
result.err()
);
let executor = PythonExecutor::new()?;
let script_path = executor.script_path("story_processor.py");
@@ -145,7 +148,11 @@ pub async fn process_story(
// ── Native implementation ─────────────────────────────────────────
fn try_native_story(_video_path: &str, output_path: &str, _uuid: Option<&str>) -> Result<StoryResult> {
fn try_native_story(
_video_path: &str,
output_path: &str,
_uuid: Option<&str>,
) -> Result<StoryResult> {
let output_dir = Path::new(output_path).parent().unwrap_or(Path::new("."));
let basename = Path::new(output_path)
.file_stem()
@@ -160,8 +167,7 @@ fn try_native_story(_video_path: &str, output_path: &str, _uuid: Option<&str>) -
let asr_data: AsrData = if asr_path.exists() {
let content = std::fs::read_to_string(&asr_path)
.with_context(|| format!("Failed to read {:?}", asr_path))?;
serde_json::from_str(&content)
.with_context(|| format!("Failed to parse {:?}", asr_path))?
serde_json::from_str(&content).with_context(|| format!("Failed to parse {:?}", asr_path))?
} else {
AsrData { segments: vec![] }
};
@@ -169,8 +175,7 @@ fn try_native_story(_video_path: &str, output_path: &str, _uuid: Option<&str>) -
let cut_data: CutData = if cut_path.exists() {
let content = std::fs::read_to_string(&cut_path)
.with_context(|| format!("Failed to read {:?}", cut_path))?;
serde_json::from_str(&content)
.with_context(|| format!("Failed to parse {:?}", cut_path))?
serde_json::from_str(&content).with_context(|| format!("Failed to parse {:?}", cut_path))?
} else {
CutData { scenes: vec![] }
};
@@ -376,22 +381,39 @@ fn generate_narrative(texts: &[String], objects: &[String], start: f64, end: f64
let mut unique: Vec<&String> = objects.iter().collect();
unique.sort();
unique.dedup();
let objs = unique.iter().take(5).map(|s| (*s).as_str()).collect::<Vec<_>>().join(", ");
let objs = unique
.iter()
.take(5)
.map(|s| (*s).as_str())
.collect::<Vec<_>>()
.join(", ");
parts.push(format!("Visuals: {}", objs));
}
format!("[{:.0}s-{:.0}s] {}", start, end, parts.join(" | "))
}
fn generate_scene_narrative(objects: &[String], start: f64, end: f64, scene_count: usize) -> String {
fn generate_scene_narrative(
objects: &[String],
start: f64,
end: f64,
scene_count: usize,
) -> String {
let mut unique: Vec<&String> = objects.iter().collect();
unique.sort();
unique.dedup();
let top5: Vec<&String> = unique.iter().take(5).cloned().collect();
if !top5.is_empty() {
let obj_str = top5.iter().map(|s| s.as_str()).collect::<Vec<_>>().join(", ");
format!("[{:.0}s-{:.0}s] {} scenes. Visuals: {}.", start, end, scene_count, obj_str)
let obj_str = top5
.iter()
.map(|s| s.as_str())
.collect::<Vec<_>>()
.join(", ");
format!(
"[{:.0}s-{:.0}s] {} scenes. Visuals: {}.",
start, end, scene_count, obj_str
)
} else {
format!("[{:.0}s-{:.0}s] {} video scenes.", start, end, scene_count)
}
@@ -408,7 +430,8 @@ mod tests {
let text = generate_narrative(
&["Hello world".to_string()],
&["person".to_string()],
0.0, 5.0,
0.0,
5.0,
);
assert!(text.contains("[0s-5s]"));
assert!(text.contains("Speech:"));
@@ -576,7 +599,10 @@ mod tests {
};
assert_eq!(result.parent_chunks[0].child_chunk_ids.len(), 2);
assert!(result.child_chunks.iter().all(|c| c.parent_chunk_id.is_some()));
assert!(result
.child_chunks
.iter()
.all(|c| c.parent_chunk_id.is_some()));
assert!(result.parent_chunks[0].parent_chunk_id.is_none());
}
@@ -594,11 +620,7 @@ mod tests {
std::fs::write(&asr_path, r#"{"segments":[]}"#).unwrap();
std::fs::write(&cut_path, r#"{"scenes":[]}"#).unwrap();
let result = try_native_story(
"/dummy.mp4",
out_path.to_str().unwrap(),
None,
).unwrap();
let result = try_native_story("/dummy.mp4", out_path.to_str().unwrap(), None).unwrap();
assert_eq!(result.stats.total_child_chunks, 0);
assert_eq!(result.stats.total_parent_chunks, 0);
@@ -616,13 +638,17 @@ mod tests {
let cut_path = dir.join(format!("{}.cut.json", basename));
let out_path = dir.join(format!("{}.story.json", basename));
std::fs::write(&asr_path, r#"{
std::fs::write(
&asr_path,
r#"{
"segments": [
{"start": 0.0, "end": 2.5, "text": "Hello", "confidence": 0.95},
{"start": 2.5, "end": 5.0, "text": "World", "confidence": 0.92},
{"start": 5.0, "end": 7.5, "text": "Foo", "confidence": 0.90}
]
}"#).unwrap();
}"#,
)
.unwrap();
std::fs::write(&cut_path, r#"{
"scenes": [
@@ -631,11 +657,7 @@ mod tests {
]
}"#).unwrap();
let result = try_native_story(
"/dummy.mp4",
out_path.to_str().unwrap(),
None,
).unwrap();
let result = try_native_story("/dummy.mp4", out_path.to_str().unwrap(), None).unwrap();
assert_eq!(result.stats.asr_children, 3);
assert_eq!(result.stats.cut_children, 2);
@@ -649,7 +671,11 @@ mod tests {
for child in &result.child_chunks {
if child.source == "asr" {
assert!(child.parent_chunk_id.is_some());
assert!(child.parent_chunk_id.as_ref().unwrap().starts_with("story_asr_"));
assert!(child
.parent_chunk_id
.as_ref()
.unwrap()
.starts_with("story_asr_"));
}
}

703
src/core/processor/tkg.rs Normal file
View File

@@ -0,0 +1,703 @@
use anyhow::{Context, Result};
use serde::Deserialize;
use sqlx::PgPool;
use std::collections::HashMap;
use std::path::Path;
use crate::core::db::postgres_db::PostgresDb;
/// Qualify a table name with the schema taken from `DATABASE_SCHEMA`.
///
/// The schema falls back to `dev` when the variable is unset or unreadable.
/// For the `public` schema the bare table name is returned; otherwise the
/// result is `<schema>.<name>`.
fn t(name: &str) -> String {
    match std::env::var("DATABASE_SCHEMA") {
        Ok(schema) if schema == "public" => name.to_string(),
        Ok(schema) => format!("{}.{}", schema, name),
        Err(_) => format!("{}.{}", "dev", name),
    }
}
// ── Input data structs ────────────────────────────────────────────
/// Deserialized form of the `<file_uuid>.yolo.json` file read by the object-node
/// and co-occurrence builders in this module.
#[derive(Debug, Deserialize)]
struct YoloJson {
/// Per-frame entries keyed by a string; the co-occurrence builder looks entries
/// up with the decimal frame number. Empty when the key is absent.
#[serde(default)]
frames: HashMap<String, YoloFrameEntry>,
}
/// Detections recorded for a single frame of the YOLO output.
///
/// Two list names are accepted; the builders in this module use `detections`
/// when it is non-empty and fall back to `objects` otherwise.
#[derive(Debug, Deserialize)]
struct YoloFrameEntry {
/// Preferred detection list.
#[serde(default)]
detections: Vec<YoloDetEntry>,
/// Alternative detection list, used only when `detections` is empty.
#[serde(default)]
objects: Vec<YoloDetEntry>,
}
/// A single detected object inside a YOLO frame entry.
#[derive(Debug, Deserialize)]
struct YoloDetEntry {
/// Class label; used as both `external_id` and `label` of `object` nodes.
/// Defaults to the empty string when missing.
#[serde(default)]
class_name: String,
/// Detection confidence; stored as `object_confidence` on co-occurrence edges.
#[serde(default)]
confidence: f64,
}
/// Deserialized form of the `<file_uuid>.asrx.json` file read by the speaker-node
/// and speaker/face edge builders in this module.
#[derive(Debug, Deserialize)]
struct AsrxJson {
/// Speech segments; the last one is used to estimate the frame rate.
#[serde(default)]
segments: Vec<AsrxSegmentEntry>,
/// Optional per-speaker statistics keyed by speaker id; one `speaker` node is
/// created per key.
#[serde(default)]
speaker_stats: Option<HashMap<String, AsrxSpeakerStat>>,
}
/// One speech segment from the ASRX file.
#[derive(Debug, Deserialize)]
struct AsrxSegmentEntry {
/// Speaker identifier; matched against the `external_id` of `speaker` nodes.
#[serde(default)]
speaker_id: String,
/// Segment start time; compared with frame/fps-derived face times and rendered
/// with an `s` suffix in edge properties.
#[serde(default)]
start_time: f64,
/// Segment end time, same unit as `start_time`.
#[serde(default)]
end_time: f64,
// NOTE(review): unlike the fields above, the two frame fields carry no
// `#[serde(default)]`, so parsing fails when they are missing — confirm that
// this is intended.
#[allow(dead_code)]
start_frame: i64,
// NOTE(review): `end_frame` is read by `build_speaker_face_edges` for its fps
// estimate, so this `allow(dead_code)` looks unnecessary — confirm.
#[allow(dead_code)]
end_frame: i64,
}
/// Per-speaker statistics entry of the ASRX file.
#[derive(Debug, Deserialize)]
struct AsrxSpeakerStat {
/// Stored as `segment_count` in the properties of the speaker node.
#[serde(default)]
count: i64,
}
// ── Face detection trace ──────────────────────────────────────────
/// Aggregated row produced by the `GROUP BY trace_id` query in
/// `build_face_trace_nodes`: one row per face trace of a file.
#[derive(Debug, sqlx::FromRow)]
struct FaceTraceRow {
/// Trace identifier the detections were grouped by.
trace_id: i64,
/// Number of detection rows in the trace (`COUNT(*)`).
frame_count: i64,
/// Lowest frame number of the trace (`MIN(frame_number)`).
start_f: i64,
/// Highest frame number of the trace (`MAX(frame_number)`).
end_f: i64,
/// Average bounding-box values over the trace; `None` is treated as 0 when the
/// node properties are built.
avg_x: Option<f64>,
avg_y: Option<f64>,
avg_w: Option<f64>,
avg_h: Option<f64>,
}
/// Single face-detection row as selected by `build_co_occurrence_edges`.
///
/// Only `trace_id` and `frame_number` are read in this module; the bounding-box
/// columns are fetched but unused (hence the `dead_code` allowances).
// NOTE(review): `build_face_trace_nodes` casts x/y/width/height with `::float8`
// before averaging, whereas the query filling this struct selects them uncast —
// confirm the column types decode into `f64`.
#[derive(Debug, sqlx::FromRow)]
struct FaceDetectionRow {
/// Trace the detection belongs to.
trace_id: i64,
/// Frame in which the face was detected.
frame_number: i64,
#[allow(dead_code)]
x: Option<f64>,
#[allow(dead_code)]
y: Option<f64>,
#[allow(dead_code)]
width: Option<f64>,
#[allow(dead_code)]
height: Option<f64>,
}
// ── Public API ────────────────────────────────────────────────────
/// Summary of one [`build_tkg`] run: how many node and edge upserts each
/// builder performed.
///
/// The counters count executed upsert statements, so re-running the build for
/// the same file reports the same numbers even though existing rows are only
/// updated.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct TkgResult {
    /// `face_trace` nodes written (one per face trace).
    pub face_trace_nodes: usize,
    /// `object` nodes written (one per distinct YOLO class).
    pub object_nodes: usize,
    /// `speaker` nodes written (one per speaker in the ASRX stats).
    pub speaker_nodes: usize,
    /// Face-trace → object `CO_OCCURS_WITH` edge upserts.
    pub co_occurrence_edges: usize,
    /// Face-trace → speaker `SPEAKS_AS` edge upserts.
    pub speaker_face_edges: usize,
    /// Face-trace ↔ face-trace `CO_OCCURS_WITH` edge upserts.
    pub face_face_edges: usize,
}
/// Build the knowledge-graph rows for one file and report what was written.
///
/// Runs the three node builders first and the three edge builders afterwards;
/// the edge builders look nodes up by `external_id`, so this order matters.
/// `output_dir` is where `<file_uuid>.yolo.json` and `<file_uuid>.asrx.json`
/// are looked for. The first failing builder aborts the run with its error.
pub async fn build_tkg(db: &PostgresDb, file_uuid: &str, output_dir: &str) -> Result<TkgResult> {
    let pool = db.pool();

    // Struct fields are evaluated top to bottom: nodes first, then edges.
    Ok(TkgResult {
        face_trace_nodes: build_face_trace_nodes(pool, file_uuid).await?,
        object_nodes: build_yolo_object_nodes(pool, file_uuid, output_dir).await?,
        speaker_nodes: build_speaker_nodes(pool, file_uuid, output_dir).await?,
        co_occurrence_edges: build_co_occurrence_edges(pool, file_uuid, output_dir).await?,
        speaker_face_edges: build_speaker_face_edges(pool, file_uuid, output_dir).await?,
        face_face_edges: build_face_face_edges(pool, file_uuid).await?,
    })
}
// ── Node builders ─────────────────────────────────────────────────
/// Upsert one `face_trace` node per face trace of `file_uuid`.
///
/// Each trace is summarised (detection count, first/last frame, rounded average
/// bounding box) and stored under the external id `trace_<trace_id>`.
/// Returns the number of traces processed.
async fn build_face_trace_nodes(pool: &PgPool, file_uuid: &str) -> Result<usize> {
    let traces = sqlx::query_as::<_, FaceTraceRow>(&format!(
        r#"
SELECT trace_id,
COUNT(*)::bigint as frame_count,
MIN(frame_number) as start_f,
MAX(frame_number) as end_f,
AVG(x::float8) as avg_x,
AVG(y::float8) as avg_y,
AVG(width::float8) as avg_w,
AVG(height::float8) as avg_h
FROM {}
WHERE file_uuid = $1 AND trace_id IS NOT NULL
GROUP BY trace_id
ORDER BY trace_id
"#,
        t("face_detections")
    ))
    .bind(file_uuid)
    .fetch_all(pool)
    .await?;

    let upsert_sql = format!(
        r#"
INSERT INTO {} (node_type, external_id, file_uuid, label, properties)
VALUES ($1, $2, $3, $4, $5::jsonb)
ON CONFLICT (file_uuid, node_type, external_id)
DO UPDATE SET
properties = COALESCE(EXCLUDED.properties, tkg_nodes.properties),
label = COALESCE(NULLIF(EXCLUDED.label, ''), tkg_nodes.label)
"#,
        t("tkg_nodes")
    );

    // Missing averages count as 0; values are stored as whole numbers.
    let rounded = |value: Option<f64>| value.unwrap_or(0.0).round() as i64;

    for trace in traces.iter() {
        let properties = serde_json::json!({
            "frame_count": trace.frame_count,
            "start_frame": trace.start_f,
            "end_frame": trace.end_f,
            "avg_bbox": {
                "x": rounded(trace.avg_x),
                "y": rounded(trace.avg_y),
                "width": rounded(trace.avg_w),
                "height": rounded(trace.avg_h),
            }
        });

        sqlx::query(&upsert_sql)
            .bind("face_trace")
            .bind(format!("trace_{}", trace.trace_id))
            .bind(file_uuid)
            .bind(format!("Face Trace {}", trace.trace_id))
            .bind(serde_json::to_string(&properties)?)
            .execute(pool)
            .await?;
    }

    // Every row was upserted (any failure returned early above).
    Ok(traces.len())
}
/// Upsert one `object` node per distinct YOLO class found for `file_uuid`.
///
/// Reads `<output_dir>/<file_uuid>.yolo.json`; when the file does not exist
/// nothing is written and `0` is returned. Each node stores the total number of
/// detections of its class as `total_detections`.
async fn build_yolo_object_nodes(
    pool: &PgPool,
    file_uuid: &str,
    output_dir: &str,
) -> Result<usize> {
    let yolo_path = Path::new(output_dir).join(format!("{}.yolo.json", file_uuid));
    if !yolo_path.exists() {
        return Ok(0);
    }

    let raw = std::fs::read_to_string(&yolo_path)
        .with_context(|| format!("Failed to read {:?}", yolo_path))?;
    let yolo: YoloJson =
        serde_json::from_str(&raw).with_context(|| format!("Failed to parse {:?}", yolo_path))?;

    // Tally detections per class, preferring `detections` over `objects` per frame.
    let mut detections_per_class: HashMap<String, i64> = HashMap::new();
    yolo.frames
        .values()
        .flat_map(|frame| {
            if frame.detections.is_empty() {
                frame.objects.iter()
            } else {
                frame.detections.iter()
            }
        })
        .for_each(|det| *detections_per_class.entry(det.class_name.clone()).or_insert(0) += 1);

    let upsert_sql = format!(
        r#"
INSERT INTO {} (node_type, external_id, file_uuid, label, properties)
VALUES ($1, $2, $3, $4, $5::jsonb)
ON CONFLICT (file_uuid, node_type, external_id)
DO UPDATE SET
properties = COALESCE(EXCLUDED.properties, tkg_nodes.properties)
"#,
        t("tkg_nodes")
    );

    for (class_name, total) in detections_per_class.iter() {
        let properties = serde_json::json!({ "total_detections": total });

        sqlx::query(&upsert_sql)
            .bind("object")
            .bind(class_name)
            .bind(file_uuid)
            .bind(class_name)
            .bind(serde_json::to_string(&properties)?)
            .execute(pool)
            .await?;
    }

    Ok(detections_per_class.len())
}
/// Upsert one `speaker` node per entry of the ASRX `speaker_stats` map.
///
/// Reads `<output_dir>/<file_uuid>.asrx.json`; when the file does not exist, or
/// it carries no speaker statistics, nothing is written and `0` is returned.
/// Each node stores the speaker's `count` as `segment_count`.
async fn build_speaker_nodes(pool: &PgPool, file_uuid: &str, output_dir: &str) -> Result<usize> {
    let asrx_path = Path::new(output_dir).join(format!("{}.asrx.json", file_uuid));
    if !asrx_path.exists() {
        return Ok(0);
    }

    let raw = std::fs::read_to_string(&asrx_path)
        .with_context(|| format!("Failed to read {:?}", asrx_path))?;
    let asrx: AsrxJson =
        serde_json::from_str(&raw).with_context(|| format!("Failed to parse {:?}", asrx_path))?;

    let speakers = match asrx.speaker_stats {
        Some(stats) => stats,
        None => return Ok(0),
    };

    let upsert_sql = format!(
        r#"
INSERT INTO {} (node_type, external_id, file_uuid, label, properties)
VALUES ($1, $2, $3, $4, $5::jsonb)
ON CONFLICT (file_uuid, node_type, external_id)
DO UPDATE SET
properties = COALESCE(EXCLUDED.properties, tkg_nodes.properties)
"#,
        t("tkg_nodes")
    );

    for (speaker_id, stat) in speakers.iter() {
        let properties = serde_json::json!({ "segment_count": stat.count });

        sqlx::query(&upsert_sql)
            .bind("speaker")
            .bind(speaker_id)
            .bind(file_uuid)
            .bind(speaker_id)
            .bind(serde_json::to_string(&properties)?)
            .execute(pool)
            .await?;
    }

    Ok(speakers.len())
}
// ── Edge builders ─────────────────────────────────────────────────
async fn build_co_occurrence_edges(
pool: &PgPool,
file_uuid: &str,
output_dir: &str,
) -> Result<usize> {
let yolo_path = Path::new(output_dir).join(format!("{}.yolo.json", file_uuid));
if !yolo_path.exists() {
return Ok(0);
}
let content = std::fs::read_to_string(&yolo_path)?;
let yolo: YoloJson = serde_json::from_str(&content)?;
let face_table = t("face_detections");
let nodes_table = t("tkg_nodes");
let edges_table = t("tkg_edges");
let face_rows = sqlx::query_as::<_, FaceDetectionRow>(&format!(
r#"SELECT trace_id, frame_number, x, y, width, height
FROM {} WHERE file_uuid = $1 AND trace_id IS NOT NULL
ORDER BY frame_number"#,
face_table
))
.bind(file_uuid)
.fetch_all(pool)
.await?;
let mut edge_count = 0;
for face in &face_rows {
let frame_str = face.frame_number.to_string();
let yolo_frame = match yolo.frames.get(&frame_str) {
Some(f) => f,
None => continue,
};
let dets = if !yolo_frame.detections.is_empty() {
&yolo_frame.detections
} else {
&yolo_frame.objects
};
if dets.is_empty() {
continue;
}
let external_id = format!("trace_{}", face.trace_id);
let face_node: Option<(i64,)> = sqlx::query_as(&format!(
"SELECT id FROM {} WHERE file_uuid=$1 AND node_type='face_trace' AND external_id=$2",
nodes_table
))
.bind(file_uuid)
.bind(&external_id)
.fetch_optional(pool)
.await?;
let face_node_id = match face_node {
Some((id,)) => id,
None => continue,
};
for det in dets {
let obj_node: Option<(i64,)> = sqlx::query_as(&format!(
"SELECT id FROM {} WHERE file_uuid=$1 AND node_type='object' AND external_id=$2",
nodes_table
))
.bind(file_uuid)
.bind(&det.class_name)
.fetch_optional(pool)
.await?;
let obj_node_id = match obj_node {
Some((id,)) => id,
None => continue,
};
let edge_props = serde_json::json!({
"frame": face.frame_number,
"object_confidence": det.confidence,
});
if let Err(e) = sqlx::query(&format!(
r#"
INSERT INTO {} (edge_type, source_node_id, target_node_id, file_uuid, properties)
VALUES ($1, $2, $3, $4, $5::jsonb)
ON CONFLICT (file_uuid, edge_type, source_node_id, target_node_id)
DO UPDATE SET properties = COALESCE(EXCLUDED.properties, tkg_edges.properties)
"#,
edges_table
))
.bind("CO_OCCURS_WITH")
.bind(face_node_id)
.bind(obj_node_id)
.bind(file_uuid)
.bind(serde_json::to_string(&edge_props)?)
.execute(pool)
.await
{
tracing::warn!(
"[TKG] Edge insert failed (trace={}, obj={}): {}",
face.trace_id,
det.class_name,
e
);
continue;
}
edge_count += 1;
}
}
Ok(edge_count)
}
async fn build_speaker_face_edges(
pool: &PgPool,
file_uuid: &str,
output_dir: &str,
) -> Result<usize> {
let asrx_path = Path::new(output_dir).join(format!("{}.asrx.json", file_uuid));
if !asrx_path.exists() {
return Ok(0);
}
let content = std::fs::read_to_string(&asrx_path)?;
let asrx: AsrxJson = serde_json::from_str(&content)?;
if asrx.segments.is_empty() {
return Ok(0);
}
let face_table = t("face_detections");
let nodes_table = t("tkg_nodes");
let edges_table = t("tkg_edges");
let traces = sqlx::query_as::<_, (i64, i64, i64)>(&format!(
r#"SELECT trace_id, MIN(frame_number) as start_f, MAX(frame_number) as end_f
FROM {} WHERE file_uuid = $1 AND trace_id IS NOT NULL
GROUP BY trace_id"#,
face_table
))
.bind(file_uuid)
.fetch_all(pool)
.await?;
// Calculate fps from last segment
let last = asrx.segments.last().unwrap();
let fps = if last.end_time > 0.0 {
last.end_frame as f64 / last.end_time
} else {
30.0
};
let mut edge_count = 0;
for (tid, sf, ef) in &traces {
let face_ext_id = format!("trace_{}", tid);
let face_node: Option<(i64,)> = sqlx::query_as(&format!(
"SELECT id FROM {} WHERE file_uuid=$1 AND node_type='face_trace' AND external_id=$2",
nodes_table
))
.bind(file_uuid)
.bind(&face_ext_id)
.fetch_optional(pool)
.await?;
let face_node_id = match face_node {
Some((id,)) => id,
None => continue,
};
let face_start_sec = *sf as f64 / fps;
let face_end_sec = *ef as f64 / fps;
for seg in &asrx.segments {
let seg_start = seg.start_time;
let seg_end = seg.end_time;
let overlap_start = face_start_sec.max(seg_start);
let overlap_end = face_end_sec.min(seg_end);
if overlap_start >= overlap_end {
continue;
}
let overlap_dur = overlap_end - overlap_start;
let face_dur = face_end_sec - face_start_sec;
if face_dur <= 0.0 {
continue;
}
let overlap_ratio = overlap_dur / face_dur;
if overlap_ratio < 0.3 {
continue;
}
let speaker_node: Option<(i64,)> = sqlx::query_as(&format!(
"SELECT id FROM {} WHERE file_uuid=$1 AND node_type='speaker' AND external_id=$2",
nodes_table
))
.bind(file_uuid)
.bind(&seg.speaker_id)
.fetch_optional(pool)
.await?;
let speaker_node_id = match speaker_node {
Some((id,)) => id,
None => continue,
};
let edge_props = serde_json::json!({
"overlap_ratio": (overlap_ratio * 1000.0).round() / 1000.0,
"overlap_duration_s": (overlap_dur * 10.0).round() / 10.0,
"face_time_range": format!("{:.1}-{:.1}s", face_start_sec, face_end_sec),
"speaker_time_range": format!("{:.1}-{:.1}s", seg_start, seg_end),
});
sqlx::query(&format!(
r#"
INSERT INTO {} (edge_type, source_node_id, target_node_id, file_uuid, properties)
VALUES ($1, $2, $3, $4, $5::jsonb)
ON CONFLICT (file_uuid, edge_type, source_node_id, target_node_id)
DO UPDATE SET properties = COALESCE(EXCLUDED.properties, tkg_edges.properties)
"#,
edges_table
))
.bind("SPEAKS_AS")
.bind(face_node_id)
.bind(speaker_node_id)
.bind(file_uuid)
.bind(serde_json::to_string(&edge_props)?)
.execute(pool)
.await?;
edge_count += 1;
}
}
Ok(edge_count)
}
/// Upsert `CO_OCCURS_WITH` edges between face traces that share a frame.
///
/// A self-join on the face-detection table lists every frame in which two
/// different traces appear together. The rows are grouped per trace pair and one
/// edge is written per pair, carrying the first shared frame and the number of
/// shared frames. Pairs for which either trace has no node are skipped.
/// Returns the number of edges written.
async fn build_face_face_edges(pool: &PgPool, file_uuid: &str) -> Result<usize> {
    let face_table = t("face_detections");
    let nodes_table = t("tkg_nodes");
    let edges_table = t("tkg_edges");

    let shared_frames: Vec<(i64, i64, i64)> = sqlx::query_as(&format!(
        r#"
SELECT a.trace_id AS tid_a, b.trace_id AS tid_b, a.frame_number
FROM {} a
JOIN {} b
ON a.file_uuid = b.file_uuid
AND a.frame_number = b.frame_number
AND a.trace_id < b.trace_id
WHERE a.file_uuid = $1
AND a.trace_id IS NOT NULL
AND b.trace_id IS NOT NULL
ORDER BY a.frame_number
"#,
        face_table, face_table
    ))
    .bind(file_uuid)
    .fetch_all(pool)
    .await?;

    if shared_frames.is_empty() {
        return Ok(0);
    }

    // Deduplicate by pair; the key is always (smaller id, larger id).
    let mut frames_by_pair: HashMap<(i64, i64), Vec<i64>> = HashMap::new();
    for &(first, second, frame) in &shared_frames {
        let pair = (first.min(second), first.max(second));
        frames_by_pair.entry(pair).or_default().push(frame);
    }

    let mut written = 0;
    for (&(lower, upper), frames) in &frames_by_pair {
        let lower_ext = format!("trace_{}", lower);
        let upper_ext = format!("trace_{}", upper);

        let lower_node: Option<(i64,)> = sqlx::query_as(&format!(
            "SELECT id FROM {} WHERE file_uuid=$1 AND node_type='face_trace' AND external_id=$2",
            nodes_table
        ))
        .bind(file_uuid)
        .bind(&lower_ext)
        .fetch_optional(pool)
        .await?;

        let upper_node: Option<(i64,)> = sqlx::query_as(&format!(
            "SELECT id FROM {} WHERE file_uuid=$1 AND node_type='face_trace' AND external_id=$2",
            nodes_table
        ))
        .bind(file_uuid)
        .bind(&upper_ext)
        .fetch_optional(pool)
        .await?;

        let (source_id, target_id) = if let (Some((a,)), Some((b,))) = (lower_node, upper_node) {
            (a, b)
        } else {
            continue;
        };

        // Rows arrive ordered by frame number, so the first entry is the earliest.
        let properties = serde_json::json!({
            "first_frame": frames[0],
            "frame_count": frames.len() as i64,
        });

        sqlx::query(&format!(
            r#"
INSERT INTO {} (edge_type, source_node_id, target_node_id, file_uuid, properties)
VALUES ($1, $2, $3, $4, $5::jsonb)
ON CONFLICT (file_uuid, edge_type, source_node_id, target_node_id)
DO UPDATE SET properties = COALESCE(EXCLUDED.properties, tkg_edges.properties)
"#,
            edges_table
        ))
        .bind("CO_OCCURS_WITH")
        .bind(source_id)
        .bind(target_id)
        .bind(file_uuid)
        .bind(serde_json::to_string(&properties)?)
        .execute(pool)
        .await?;
        written += 1;
    }

    Ok(written)
}
// ── Tests ─────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;

    /// A payload with two keyed frames parses into two entries, and the first
    /// frame's first detection keeps its class name.
    #[test]
    fn test_yolo_json_deserialize() {
        let payload = r#"{
"frames": {
"1": {"time_seconds": 0.0, "detections": [{"class_name": "person", "confidence": 0.9}]},
"2": {"time_seconds": 1.0, "detections": [{"class_name": "chair", "confidence": 0.8}]}
}
}"#;
        let parsed: YoloJson = serde_json::from_str(payload).unwrap();
        assert_eq!(parsed.frames.len(), 2);
        let first_frame = &parsed.frames["1"];
        assert_eq!(first_frame.detections[0].class_name, "person");
    }

    /// An empty `frames` object parses to an empty collection.
    #[test]
    fn test_yolo_json_empty_frames() {
        let payload = r#"{"frames": {}}"#;
        let parsed: YoloJson = serde_json::from_str(payload).unwrap();
        assert!(parsed.frames.is_empty());
    }

    /// A single segment plus `speaker_stats` parses, preserving the speaker id.
    #[test]
    fn test_asrx_json_deserialize() {
        let payload = r#"{
"segments": [
{"speaker_id": "SPEAKER_01", "start_time": 0.0, "end_time": 2.0, "start_frame": 0, "end_frame": 60}
],
"speaker_stats": {"SPEAKER_01": {"count": 1}}
}"#;
        let parsed: AsrxJson = serde_json::from_str(payload).unwrap();
        assert_eq!(parsed.segments.len(), 1);
        let only_segment = &parsed.segments[0];
        assert_eq!(only_segment.speaker_id, "SPEAKER_01");
    }

    /// `speaker_stats` may be omitted and then deserialises as `None`.
    #[test]
    fn test_asrx_json_no_stats() {
        let payload = r#"{"segments": []}"#;
        let parsed: AsrxJson = serde_json::from_str(payload).unwrap();
        assert!(parsed.speaker_stats.is_none());
    }

    /// A frame that only carries an `objects` array fills `objects` and leaves
    /// `detections` empty.
    #[test]
    fn test_yolo_objects_fallback() {
        let payload = r#"{
"frames": {
"1": {"objects": [{"class_name": "person"}]}
}
}"#;
        let parsed: YoloJson = serde_json::from_str(payload).unwrap();
        let frame = &parsed.frames["1"];
        assert_eq!(frame.objects[0].class_name, "person");
        assert!(frame.detections.is_empty());
    }

    /// `TkgResult` can be built field by field and the node counters read back.
    #[test]
    fn test_tkg_result() {
        let result = TkgResult {
            face_trace_nodes: 5,
            object_nodes: 10,
            speaker_nodes: 3,
            co_occurrence_edges: 20,
            speaker_face_edges: 8,
            face_face_edges: 4,
        };
        assert_eq!(
            (
                result.face_trace_nodes,
                result.object_nodes,
                result.speaker_nodes
            ),
            (5, 10, 3)
        );
    }
}

View File

@@ -1,7 +1,7 @@
use anyhow::Result;
use sha2::{Digest, Sha256};
use std::io::Read;
use std::path::Path;
use anyhow::Result;
/// Compute SHA256 of the entire file content
pub fn compute_sha256(path: &Path) -> Result<String> {
@@ -10,7 +10,9 @@ pub fn compute_sha256(path: &Path) -> Result<String> {
let mut buf = [0u8; 65536];
loop {
let n = file.read(&mut buf)?;
if n == 0 { break; }
if n == 0 {
break;
}
hasher.update(&buf[..n]);
}
let hash = format!("{:x}", hasher.finalize());

View File

@@ -65,7 +65,11 @@ pub fn tmdb_cache_path(file_uuid: &str) -> PathBuf {
pub fn read_tmdb_cache(file_uuid: &str) -> Result<TmdbCache> {
let path = tmdb_cache_path(file_uuid);
if !path.exists() {
anyhow::bail!("TMDb cache not found: {} (expected: {})", file_uuid, path.display());
anyhow::bail!(
"TMDb cache not found: {} (expected: {})",
file_uuid,
path.display()
);
}
let content = std::fs::read_to_string(&path)
.with_context(|| format!("Failed to read TMDb cache: {}", path.display()))?;
@@ -96,9 +100,7 @@ pub fn count_cache_files() -> usize {
match std::fs::read_dir(&dir) {
Ok(entries) => entries
.filter_map(|e| e.ok())
.filter(|e| {
e.file_name().to_string_lossy().ends_with(".tmdb.json")
})
.filter(|e| e.file_name().to_string_lossy().ends_with(".tmdb.json"))
.count(),
Err(_) => 0,
}

View File

@@ -46,11 +46,12 @@ pub async fn match_faces_against_tmdb(db: &PostgresDb, file_uuid: &str) -> Resul
// Step 2: Load face_detections grouped by trace_id
let fd_table = schema::table_name("face_detections");
let fd_rows = sqlx::query_as::<_, (i32, Vec<f32>)>(
&format!("SELECT trace_id, embedding FROM {} \
let fd_rows = sqlx::query_as::<_, (i32, Vec<f32>)>(&format!(
"SELECT trace_id, embedding FROM {} \
WHERE file_uuid=$1 AND trace_id IS NOT NULL AND embedding IS NOT NULL \
ORDER BY trace_id", fd_table),
)
ORDER BY trace_id",
fd_table
))
.bind(file_uuid)
.fetch_all(pool)
.await?;
@@ -156,9 +157,10 @@ pub async fn match_faces_against_tmdb(db: &PostgresDb, file_uuid: &str) -> Resul
let fd_table = schema::table_name("face_detections");
let mut after_qc = HashMap::new();
for (&tid, &(id, ref name)) in &matched {
let cnt: i64 = sqlx::query_scalar(
&format!("SELECT COUNT(*) FROM {} WHERE file_uuid=$1 AND trace_id=$2", fd_table),
)
let cnt: i64 = sqlx::query_scalar(&format!(
"SELECT COUNT(*) FROM {} WHERE file_uuid=$1 AND trace_id=$2",
fd_table
))
.bind(file_uuid)
.bind(tid)
.fetch_one(pool)
@@ -194,9 +196,10 @@ pub async fn match_faces_against_tmdb(db: &PostgresDb, file_uuid: &str) -> Resul
// Step 5: Update DB
let mut updated = 0usize;
for (&tid, &(id, _)) in &matched {
let r = sqlx::query(
&format!("UPDATE {} SET identity_id=$1 WHERE file_uuid=$2 AND trace_id=$3", fd_table),
)
let r = sqlx::query(&format!(
"UPDATE {} SET identity_id=$1 WHERE file_uuid=$2 AND trace_id=$3",
fd_table
))
.bind(id)
.bind(file_uuid)
.bind(tid)
@@ -223,9 +226,8 @@ pub async fn match_faces_against_tmdb(db: &PostgresDb, file_uuid: &str) -> Resul
async fn quality_check_temporal_collisions(pool: &sqlx::PgPool, file_uuid: &str) -> Result<usize> {
let fd_table = schema::table_name("face_detections");
// Find all collision pairs: same identity, same frame, different trace
let collisions = sqlx::query_as::<_, (i32, i32, i32, i32)>(
&format!(
"SELECT a.identity_id, a.trace_id, b.trace_id, a.frame_number \
let collisions = sqlx::query_as::<_, (i32, i32, i32, i32)>(&format!(
"SELECT a.identity_id, a.trace_id, b.trace_id, a.frame_number \
FROM {} a \
JOIN {} b \
ON a.file_uuid = b.file_uuid \
@@ -235,9 +237,8 @@ async fn quality_check_temporal_collisions(pool: &sqlx::PgPool, file_uuid: &str)
AND a.identity_id IS NOT NULL \
AND a.identity_id = b.identity_id \
ORDER BY a.identity_id, a.frame_number",
fd_table, fd_table
),
)
fd_table, fd_table
))
.bind(file_uuid)
.fetch_all(pool)
.await?;
@@ -256,25 +257,36 @@ async fn quality_check_temporal_collisions(pool: &sqlx::PgPool, file_uuid: &str)
let mut unbound = 0usize;
for ((id, ta, tb), overlap_frames) in &collision_groups {
// Get face detection count for each trace
let cnt_a: i64 = sqlx::query_scalar(
&format!("SELECT COUNT(*) FROM {} WHERE file_uuid=$1 AND trace_id=$2 AND identity_id=$3", fd_table)
)
.bind(file_uuid).bind(ta).bind(id)
.fetch_one(pool).await.unwrap_or(0);
let cnt_a: i64 = sqlx::query_scalar(&format!(
"SELECT COUNT(*) FROM {} WHERE file_uuid=$1 AND trace_id=$2 AND identity_id=$3",
fd_table
))
.bind(file_uuid)
.bind(ta)
.bind(id)
.fetch_one(pool)
.await
.unwrap_or(0);
let cnt_b: i64 = sqlx::query_scalar(
&format!("SELECT COUNT(*) FROM {} WHERE file_uuid=$1 AND trace_id=$2 AND identity_id=$3", fd_table)
)
.bind(file_uuid).bind(tb).bind(id)
.fetch_one(pool).await.unwrap_or(0);
let cnt_b: i64 = sqlx::query_scalar(&format!(
"SELECT COUNT(*) FROM {} WHERE file_uuid=$1 AND trace_id=$2 AND identity_id=$3",
fd_table
))
.bind(file_uuid)
.bind(tb)
.bind(id)
.fetch_one(pool)
.await
.unwrap_or(0);
// Unbind the trace with fewer detections (likely the false positive)
let victim = if cnt_a <= cnt_b { *ta } else { *tb };
let victim_cnt = if cnt_a <= cnt_b { cnt_a } else { cnt_b };
sqlx::query(
&format!("UPDATE {} SET identity_id=NULL WHERE file_uuid=$1 AND trace_id=$2", fd_table),
)
sqlx::query(&format!(
"UPDATE {} SET identity_id=NULL WHERE file_uuid=$1 AND trace_id=$2",
fd_table
))
.bind(file_uuid)
.bind(victim)
.execute(pool)

View File

@@ -45,7 +45,14 @@ fn extract_movie_name(filename: &str) -> Option<String> {
.file_stem()
.and_then(|s| s.to_str())?;
let cleaned = name.replace(['.', '_'], " ").trim().to_string();
// Take only the part before year patterns or separators
let cleaned = name
.replace(['.', '_'], " ")
.split(|c: char| c == '(' || c == '[' || c == '│' || c == '|')
.next()
.unwrap_or(&name)
.trim()
.to_string();
if cleaned.is_empty() || cleaned.len() < 3 {
return None;
@@ -53,10 +60,7 @@ fn extract_movie_name(filename: &str) -> Option<String> {
Some(cleaned)
}
pub async fn probe_from_cache(
db: &PostgresDb,
file_uuid: &str,
) -> Result<TmdbProbeResult> {
pub async fn probe_from_cache(db: &PostgresDb, file_uuid: &str) -> Result<TmdbProbeResult> {
let cache = crate::core::tmdb::cache::read_tmdb_cache(file_uuid)?;
if cache.identities.is_empty() && !cache.cast.is_empty() {
return create_identities_from_data(db, file_uuid, &cache.movie, &cache.cast).await;
@@ -83,7 +87,8 @@ async fn upsert_identities_from_disk(
}
match std::fs::read_to_string(&path) {
Ok(content) => {
match serde_json::from_str::<crate::core::identity::storage::IdentityFile>(&content) {
match serde_json::from_str::<crate::core::identity::storage::IdentityFile>(&content)
{
Ok(identity_file) => {
let identities_table = crate::core::db::schema::table_name("identities");
let result = sqlx::query(&format!(
@@ -106,21 +111,35 @@ async fn upsert_identities_from_disk(
match result {
Ok(_) => {
info!("[TMDB] Upserted identity: {} (uuid={})", identity_file.name, identity_file.identity_uuid);
info!(
"[TMDB] Upserted identity: {} (uuid={})",
identity_file.name, identity_file.identity_uuid
);
identities_created += 1;
}
Err(e) => {
warn!("[TMDB] Failed to upsert identity '{}': {}", identity_file.name, e);
warn!(
"[TMDB] Failed to upsert identity '{}': {}",
identity_file.name, e
);
}
}
}
Err(e) => {
warn!("[TMDB] Failed to parse identity file {}: {}", path.display(), e);
warn!(
"[TMDB] Failed to parse identity file {}: {}",
path.display(),
e
);
}
}
}
Err(e) => {
warn!("[TMDB] Failed to read identity file {}: {}", path.display(), e);
warn!(
"[TMDB] Failed to read identity file {}: {}",
path.display(),
e
);
}
}
}
@@ -181,7 +200,9 @@ pub async fn create_identities_from_data(
continue;
}
let profile_url = member.profile_path.as_ref()
let profile_url = member
.profile_path
.as_ref()
.map(|p| format!("https://image.tmdb.org/t/p/w185{}", p));
let metadata = serde_json::json!({
@@ -226,8 +247,13 @@ pub async fn create_identities_from_data(
member.name, member.character, uuid_str
);
identities_created += 1;
if let Err(e) = crate::core::identity::storage::save_identity_file(db, &uuid_str).await {
warn!("[TMDB] Failed to save identity file for {}: {}", member.name, e);
if let Err(e) =
crate::core::identity::storage::save_identity_file(db, &uuid_str).await
{
warn!(
"[TMDB] Failed to save identity file for {}: {}",
member.name, e
);
}
// Download and save TMDb profile image locally
if let Some(url) = &profile_url {
@@ -393,8 +419,10 @@ pub async fn probe_movie(
overview: movie.overview.clone(),
poster_path: movie.poster_path.clone(),
};
let cache_cast: Vec<cache::TmdbCastMember> = credits.cast.iter().map(|m| {
cache::TmdbCastMember {
let cache_cast: Vec<cache::TmdbCastMember> = credits
.cast
.iter()
.map(|m| cache::TmdbCastMember {
id: m.id,
name: m.name.clone(),
character: m.character.clone(),
@@ -410,8 +438,8 @@ pub async fn probe_movie(
deathday: None,
gender: None,
homepage: None,
}
}).collect();
})
.collect();
// Write TMDb cache so probe_from_cache can be used next time
let cache_obj = cache::TmdbCache {

View File

@@ -60,7 +60,11 @@ pub async fn check_tmdb_api() -> TmdbResourceStatus {
enabled: *config::tmdb::PROBE_ENABLED,
api_reachable: Some(reachable),
api_latency_ms: Some(latency),
api_error: if reachable { None } else { Some(format!("HTTP {}", resp.status())) },
api_error: if reachable {
None
} else {
Some(format!("HTTP {}", resp.status()))
},
last_check_at: Some(chrono::Utc::now().to_rfc3339()),
}
}
@@ -84,9 +88,10 @@ pub fn count_cache_files() -> usize {
pub async fn count_tmdb_identities(pool: &sqlx::PgPool) -> Result<i64> {
let identities_table = crate::core::db::schema::table_name("identities");
let count: i64 = sqlx::query_scalar(
&format!("SELECT COUNT(*) FROM {} WHERE source = 'tmdb'", identities_table)
)
let count: i64 = sqlx::query_scalar(&format!(
"SELECT COUNT(*) FROM {} WHERE source = 'tmdb'",
identities_table
))
.fetch_one(pool)
.await?;
Ok(count)
@@ -94,9 +99,10 @@ pub async fn count_tmdb_identities(pool: &sqlx::PgPool) -> Result<i64> {
pub async fn count_tmdb_identities_with_embedding(pool: &sqlx::PgPool) -> Result<i64> {
let identities_table = crate::core::db::schema::table_name("identities");
let count: i64 = sqlx::query_scalar(
&format!("SELECT COUNT(*) FROM {} WHERE source = 'tmdb' AND face_embedding IS NOT NULL", identities_table)
)
let count: i64 = sqlx::query_scalar(&format!(
"SELECT COUNT(*) FROM {} WHERE source = 'tmdb' AND face_embedding IS NOT NULL",
identities_table
))
.fetch_one(pool)
.await?;
Ok(count)