feat: ASRX hybrid pipeline, identity history, worker fixes, checkpoint system

This commit is contained in:
Accusys
2026-06-02 07:13:23 +08:00
parent e3066c3f49
commit e1572907ae
198 changed files with 43705 additions and 8910 deletions

View File

@@ -0,0 +1,604 @@
use anyhow::{Context, Result};
use bson::{doc, oid::ObjectId, DateTime as BsonDateTime, Document};
use chrono::{DateTime, Utc};
use mongodb::{Client, Collection, Database, IndexModel};
use serde::{Deserialize, Serialize};
use serde_json::Value as JsonValue;
use uuid::Uuid;
const COLLECTION_NAME: &str = "identity_merge_history";
fn bson_doc_to_json(doc: &Document) -> JsonValue {
match bson::to_bson(doc) {
Ok(bson) => bson.into_relaxed_extjson(),
Err(_) => JsonValue::Null,
}
}
fn json_value_to_bson_doc(value: &JsonValue) -> Document {
bson::to_document(value).unwrap_or_default()
}
fn doc_field_to_json(doc: &Document, key: &str) -> JsonValue {
doc.get(key)
.map(|b| b.clone().into_relaxed_extjson())
.unwrap_or(JsonValue::Null)
}
fn json_to_bson(value: &JsonValue) -> bson::Bson {
bson::to_bson(value).unwrap_or(bson::Bson::Null)
}
#[derive(Debug, Clone)]
pub struct IdentityMergeHistory {
pub id: Option<ObjectId>,
pub merge_id: String,
pub source_identity: IdentitySnapshot,
pub target_identity: TargetIdentitySnapshot,
pub aliases_added_to_target: Vec<AliasEntry>,
pub metadata_fields_added: Vec<String>,
pub faces_transferred: FacesTransferred,
pub merge_params: MergeParams,
pub merged_at: DateTime<Utc>,
pub undo_deadline: DateTime<Utc>,
pub undone: bool,
pub undone_at: Option<DateTime<Utc>>,
pub undone_by: Option<String>,
pub undone_snapshot: Option<UndoneSnapshot>,
pub undo_expired: bool,
}
#[derive(Debug, Clone)]
pub struct IdentitySnapshot {
pub id: i64,
pub uuid: String,
pub name: String,
pub identity_type: Option<String>,
pub source: Option<String>,
pub status: String,
pub tmdb_id: Option<i64>,
pub tmdb_profile: Option<String>,
pub metadata: JsonValue,
pub created_at: Option<DateTime<Utc>>,
pub face_count: i64,
}
#[derive(Debug, Clone)]
pub struct TargetIdentitySnapshot {
pub id: i64,
pub uuid: String,
pub name: String,
pub metadata_before: JsonValue,
pub metadata_after: Option<JsonValue>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AliasEntry {
pub name: String,
pub locale: String,
#[serde(skip_serializing_if = "Option::is_none")]
pub source: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FacesTransferred {
pub file_uuid: String,
pub face_ids: Vec<String>,
pub trace_ids: Vec<i32>,
pub count: i64,
}
#[derive(Debug, Clone)]
pub struct UndoneSnapshot {
pub source_identity_id: i64,
pub source_uuid: String,
pub source_name: String,
pub target_metadata_at_undo: JsonValue,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MergeParams {
pub keep_history: bool,
pub cleared_stranger_id: bool,
pub performed_by_user: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MergeHistoryQuery {
pub source_uuid: Option<String>,
pub target_uuid: Option<String>,
pub merge_id: Option<String>,
pub undone: Option<bool>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MergeHistoryEntry {
pub merge_id: String,
pub source_name: String,
pub target_name: String,
pub faces_transferred: i64,
pub merged_at: DateTime<Utc>,
pub undo_deadline: DateTime<Utc>,
pub undone: bool,
pub undo_expired: bool,
}
impl IdentityMergeHistory {
pub fn from_document(doc: &Document) -> Result<Self> {
let source = doc
.get_document("source_identity")
.context("Missing source_identity")?;
let target = doc
.get_document("target_identity")
.context("Missing target_identity")?;
let faces = doc
.get_document("faces_transferred")
.context("Missing faces_transferred")?;
let aliases = doc
.get_array("aliases_added_to_target")
.unwrap_or(&vec![])
.clone();
let fields = doc
.get_array("metadata_fields_added")
.unwrap_or(&vec![])
.clone();
let merge_params_doc = doc
.get_document("merge_params")
.unwrap_or(&Document::new())
.clone();
let mut parsed_aliases = Vec::new();
for a in aliases {
if let Some(d) = a.as_document() {
parsed_aliases.push(AliasEntry {
name: d.get_str("name").unwrap_or("").to_string(),
locale: d.get_str("locale").unwrap_or("en").to_string(),
source: d.get_str("source").ok().map(|s| s.to_string()),
});
}
}
let mut parsed_fields = Vec::new();
for f in fields {
if let Some(s) = f.as_str() {
parsed_fields.push(s.to_string());
}
}
let undone_snapshot = doc.get_document("undone_snapshot").ok().and_then(|d| {
let sid = d.get_i64("source_identity_id").unwrap_or(0);
let suuid = d.get_str("source_uuid").unwrap_or("").to_string();
let sname = d.get_str("source_name").unwrap_or("").to_string();
let meta = doc_field_to_json(d, "target_metadata_at_undo");
Some(UndoneSnapshot {
source_identity_id: sid,
source_uuid: suuid,
source_name: sname,
target_metadata_at_undo: meta,
})
});
Ok(IdentityMergeHistory {
id: doc.get_object_id("_id").ok(),
merge_id: doc.get_str("merge_id").unwrap_or("").to_string(),
source_identity: IdentitySnapshot {
id: source.get_i64("id").unwrap_or(0),
uuid: source.get_str("uuid").unwrap_or("").to_string(),
name: source.get_str("name").unwrap_or("").to_string(),
identity_type: source.get_str("identity_type").ok().map(|s| s.to_string()),
source: source.get_str("source").ok().map(|s| s.to_string()),
status: source.get_str("status").unwrap_or("").to_string(),
tmdb_id: source.get_i64("tmdb_id").ok(),
tmdb_profile: source.get_str("tmdb_profile").ok().map(|s| s.to_string()),
metadata: doc_field_to_json(source, "metadata"),
created_at: source
.get_datetime("created_at")
.map(|d| d.to_chrono())
.ok(),
face_count: source.get_i64("face_count").unwrap_or(0),
},
target_identity: TargetIdentitySnapshot {
id: target.get_i64("id").unwrap_or(0),
uuid: target.get_str("uuid").unwrap_or("").to_string(),
name: target.get_str("name").unwrap_or("").to_string(),
metadata_before: doc_field_to_json(target, "metadata_before"),
metadata_after: target
.get("metadata_after")
.map(|b| b.clone().into_relaxed_extjson()),
},
aliases_added_to_target: parsed_aliases,
metadata_fields_added: parsed_fields,
faces_transferred: FacesTransferred {
file_uuid: faces.get_str("file_uuid").unwrap_or("").to_string(),
face_ids: faces
.get_array("face_ids")
.map(|arr| {
arr.iter()
.filter_map(|b| b.as_str().map(|s| s.to_string()))
.collect()
})
.unwrap_or_default(),
trace_ids: faces
.get_array("trace_ids")
.map(|arr| arr.iter().filter_map(|b| b.as_i32()).collect())
.unwrap_or_default(),
count: faces.get_i64("count").unwrap_or(0),
},
merge_params: MergeParams {
keep_history: merge_params_doc.get_bool("keep_history").unwrap_or(true),
cleared_stranger_id: merge_params_doc
.get_bool("cleared_stranger_id")
.unwrap_or(true),
performed_by_user: merge_params_doc
.get_str("performed_by_user")
.ok()
.map(|s| s.to_string()),
},
merged_at: doc
.get_datetime("merged_at")
.map(|d| d.to_chrono())
.unwrap_or_default(),
undo_deadline: doc
.get_datetime("undo_deadline")
.map(|d| d.to_chrono())
.unwrap_or_default(),
undone: doc.get_bool("undone").unwrap_or(false),
undone_at: doc.get_datetime("undone_at").map(|d| d.to_chrono()).ok(),
undone_by: doc.get_str("undone_by").ok().map(|s| s.to_string()),
undone_snapshot,
undo_expired: doc.get_bool("undo_expired").unwrap_or(false),
})
}
pub fn to_document(&self) -> Document {
let mut doc = doc! {
"merge_id": &self.merge_id,
"source_identity": {
"id": self.source_identity.id as i64,
"uuid": &self.source_identity.uuid,
"name": &self.source_identity.name,
"identity_type": self.source_identity.identity_type.as_deref(),
"source": self.source_identity.source.as_deref(),
"status": &self.source_identity.status,
"tmdb_id": self.source_identity.tmdb_id,
"tmdb_profile": self.source_identity.tmdb_profile.as_deref(),
"metadata": json_to_bson(&self.source_identity.metadata),
"created_at": self.source_identity.created_at
.map(|dt| BsonDateTime::from_chrono(dt)),
"face_count": self.source_identity.face_count,
},
"target_identity": {
"id": self.target_identity.id as i64,
"uuid": &self.target_identity.uuid,
"name": &self.target_identity.name,
"metadata_before": json_to_bson(&self.target_identity.metadata_before),
"metadata_after": self.target_identity.metadata_after.as_ref().map(json_to_bson),
},
"aliases_added_to_target": self.aliases_added_to_target.iter().map(|a| {
doc! {
"name": &a.name,
"locale": &a.locale,
"source": a.source.as_deref(),
}
}).collect::<Vec<Document>>(),
"metadata_fields_added": &self.metadata_fields_added,
"faces_transferred": {
"file_uuid": &self.faces_transferred.file_uuid,
"face_ids": &self.faces_transferred.face_ids,
"trace_ids": &self.faces_transferred.trace_ids,
"count": self.faces_transferred.count,
},
"merge_params": {
"keep_history": self.merge_params.keep_history,
"cleared_stranger_id": self.merge_params.cleared_stranger_id,
"performed_by_user": self.merge_params.performed_by_user.as_deref(),
},
"merged_at": BsonDateTime::from_chrono(self.merged_at),
"undo_deadline": BsonDateTime::from_chrono(self.undo_deadline),
"undone": self.undone,
"undone_at": self.undone_at.map(|dt| BsonDateTime::from_chrono(dt)),
"undone_by": self.undone_by.as_deref(),
"undone_snapshot": self.undone_snapshot.as_ref().map(|s| {
doc! {
"source_identity_id": s.source_identity_id,
"source_uuid": &s.source_uuid,
"source_name": &s.source_name,
"target_metadata_at_undo": json_to_bson(&s.target_metadata_at_undo),
}
}),
"undo_expired": self.undo_expired,
};
if let Some(ref oid) = self.id {
doc.insert("_id", oid.clone());
}
doc
}
}
#[derive(Clone)]
pub struct IdentityMergeHistoryStore {
client: Client,
db: Database,
collection: Collection<Document>,
}
impl IdentityMergeHistoryStore {
pub async fn init() -> Result<Self> {
let uri = crate::core::config::MONGODB_URL.as_str();
let client = Client::with_uri_str(uri)
.await
.context("Failed to connect to MongoDB")?;
let db_name = crate::core::config::MONGODB_DATABASE.as_str();
let db = client.database(db_name);
let collection: Collection<Document> = db.collection(COLLECTION_NAME);
let store = Self {
client,
db,
collection,
};
store.ensure_indexes().await?;
Ok(store)
}
async fn ensure_indexes(&self) -> Result<()> {
let merge_id_index = IndexModel::builder()
.keys(doc! { "merge_id": 1 })
.options(
mongodb::options::IndexOptions::builder()
.unique(true)
.build(),
)
.build();
let merged_at_index = IndexModel::builder().keys(doc! { "merged_at": -1 }).build();
let source_uuid_index = IndexModel::builder()
.keys(doc! { "source_identity.uuid": 1 })
.build();
let target_uuid_index = IndexModel::builder()
.keys(doc! { "target_identity.uuid": 1 })
.build();
self.collection
.create_indexes(
[
merge_id_index,
merged_at_index,
source_uuid_index,
target_uuid_index,
],
None,
)
.await
.context("Failed to create identity_merge_history indexes")?;
tracing::info!("MongoDB identity_merge_history indexes ensured");
Ok(())
}
pub fn generate_merge_id() -> String {
Uuid::new_v4().to_string()
}
pub async fn store_merge_history(&self, history: &IdentityMergeHistory) -> Result<()> {
let doc = history.to_document();
self.collection
.insert_one(doc, None)
.await
.context("Failed to store merge history in MongoDB")?;
tracing::info!(
"Stored merge history: merge_id={}, source={}, target={}, faces={}",
history.merge_id,
history.source_identity.name,
history.target_identity.name,
history.faces_transferred.count
);
Ok(())
}
pub async fn get_merge_history(&self, merge_id: &str) -> Result<Option<IdentityMergeHistory>> {
let filter = doc! { "merge_id": merge_id };
let result = self
.collection
.find_one(filter, None)
.await
.context("Failed to get merge history from MongoDB")?;
match result {
Some(doc) => {
let history = IdentityMergeHistory::from_document(&doc)
.context("Failed to parse merge history from MongoDB")?;
Ok(Some(history))
}
None => Ok(None),
}
}
pub async fn query_merge_history(
&self,
query: MergeHistoryQuery,
page: u32,
page_size: u32,
) -> Result<(Vec<MergeHistoryEntry>, u64)> {
let mut filter = doc! {};
if let Some(source_uuid) = query.source_uuid {
filter.insert("source_identity.uuid", source_uuid);
}
if let Some(target_uuid) = query.target_uuid {
filter.insert("target_identity.uuid", target_uuid);
}
if let Some(merge_id) = query.merge_id {
filter.insert("merge_id", merge_id);
}
if let Some(undone) = query.undone {
filter.insert("undone", undone);
}
let skip = (page - 1) * page_size;
let limit = page_size;
let mut cursor = self
.collection
.find(filter.clone(), None)
.await
.context("Failed to query merge history")?;
let total = self
.collection
.count_documents(filter, None)
.await
.context("Failed to count merge history")?;
let mut results: Vec<MergeHistoryEntry> = Vec::new();
let mut count = 0;
while cursor.advance().await.context("Failed to advance cursor")? {
if count >= skip && results.len() < limit as usize {
let doc: Document = cursor
.deserialize_current()
.context("Failed to deserialize")?;
let merge_id = doc.get_str("merge_id").unwrap_or("").to_string();
let source_name = doc
.get_document("source_identity")
.map(|d| d.get_str("name").unwrap_or("").to_string())
.unwrap_or_default();
let target_name = doc
.get_document("target_identity")
.map(|d| d.get_str("name").unwrap_or("").to_string())
.unwrap_or_default();
let faces_count = doc
.get_document("faces_transferred")
.map(|d| d.get_i64("count").unwrap_or(0))
.unwrap_or(0);
let merged_at = doc
.get_datetime("merged_at")
.map(|d| d.to_chrono())
.unwrap_or_default();
let undo_deadline = doc
.get_datetime("undo_deadline")
.map(|d| d.to_chrono())
.unwrap_or_default();
let undone = doc.get_bool("undone").unwrap_or(false);
let undo_expired = doc.get_bool("undo_expired").unwrap_or(false);
results.push(MergeHistoryEntry {
merge_id,
source_name,
target_name,
faces_transferred: faces_count,
merged_at,
undo_deadline,
undone,
undo_expired,
});
}
count += 1;
}
Ok((results, total))
}
pub async fn mark_as_undone(
&self,
merge_id: &str,
undone_by: Option<&str>,
undone_snapshot: UndoneSnapshot,
) -> Result<()> {
let filter = doc! { "merge_id": merge_id };
let snapshot_doc = doc! {
"source_identity_id": undone_snapshot.source_identity_id,
"source_uuid": &undone_snapshot.source_uuid,
"source_name": &undone_snapshot.source_name,
"target_metadata_at_undo": json_to_bson(&undone_snapshot.target_metadata_at_undo),
};
let update = doc! {
"$set": {
"undone": true,
"undone_at": BsonDateTime::from_chrono(Utc::now()),
"undone_by": undone_by,
"undone_snapshot": snapshot_doc,
}
};
self.collection
.update_one(filter, update, None)
.await
.context("Failed to mark merge as undone")?;
tracing::info!("Marked merge {} as undone", merge_id);
Ok(())
}
pub async fn mark_as_redone(&self, merge_id: &str, redone_by: Option<&str>) -> Result<()> {
let now = Utc::now();
let new_deadline = now + chrono::Duration::hours(24);
let filter = doc! { "merge_id": merge_id };
let update = doc! {
"$set": {
"undone": false,
"undone_at": bson::Bson::Null,
"undone_by": redone_by,
"undone_snapshot": bson::Bson::Null,
"undo_deadline": BsonDateTime::from_chrono(new_deadline),
"undo_expired": false
}
};
self.collection
.update_one(filter, update, None)
.await
.context("Failed to mark merge as redone")?;
tracing::info!(
"Marked merge {} as redone (new deadline: {})",
merge_id,
new_deadline
);
Ok(())
}
pub async fn check_undo_deadline(&self, merge_id: &str) -> Result<bool> {
let history = self
.get_merge_history(merge_id)
.await?
.context("Merge history not found")?;
let now = Utc::now();
if now > history.undo_deadline {
return Ok(false);
}
Ok(true)
}
pub async fn mark_expired_merges(&self) -> Result<u64> {
let now = BsonDateTime::from_chrono(Utc::now());
let filter = doc! {
"undo_deadline": { "$lt": now },
"undone": false,
"undo_expired": false
};
let update = doc! { "$set": { "undo_expired": true } };
let result = self
.collection
.update_many(filter, update, None)
.await
.context("Failed to mark expired merges")?;
let count = result.modified_count;
if count > 0 {
tracing::info!("Marked {} expired merges", count);
}
Ok(count)
}
}

View File

@@ -32,17 +32,21 @@ pub trait VectorStore: Send + Sync {
async fn search(&self, query_vector: &[f32], limit: usize) -> Result<Vec<SearchResult>>;
}
pub mod identity_merge_history;
pub mod mongodb_db;
pub mod postgres_db;
pub mod qdrant_db;
pub mod redis_client;
pub mod redis_db;
pub mod sync_db;
pub use identity_merge_history::{
AliasEntry, FacesTransferred, IdentityMergeHistory, IdentityMergeHistoryStore,
IdentitySnapshot, MergeHistoryEntry, MergeHistoryQuery, MergeParams, TargetIdentitySnapshot,
UndoneSnapshot,
};
pub use mongodb_db::MongoDb;
pub use postgres_db::{
Bm25Result, CandidateRecord, CreateApiKeyConfig, FileIdentityRecord, FileRecord,
HybridSearchResult, IdentityChunkRecord, IdentityDetailRecord, IdentityFaceRecord,
Bm25Result, CandidateRecord, CreateApiKeyConfig, FileFaceRecord, FileIdentityRecord,
FileRecord, HybridSearchResult, IdentityChunkRecord, IdentityDetailRecord, IdentityFaceRecord,
IdentityFileRecord, MonitorJob, MonitorJobStats, MonitorJobStatus, PipelineType, PostgresDb,
ProcessorJobStatus, ProcessorResult, ProcessorType, ResourceRecord, VideoRecord, VideoStatus,
};
@@ -52,4 +56,3 @@ pub use redis_client::{
ProgressMessage, RedisClient,
};
pub use redis_db::RedisDb;
pub use sync_db::SyncDb;

View File

@@ -131,7 +131,6 @@ impl MongoDb {
pre_chunk_ids: vec![],
parent_chunk_id: doc.parent_chunk_id,
child_chunk_ids: doc.child_chunk_ids,
visual_stats: None,
}
})
.collect();
@@ -190,7 +189,6 @@ impl MongoDb {
pre_chunk_ids: vec![],
parent_chunk_id: doc.parent_chunk_id,
child_chunk_ids: doc.child_chunk_ids,
visual_stats: None,
}
})
.collect();
@@ -246,7 +244,6 @@ impl MongoDb {
pre_chunk_ids: vec![],
parent_chunk_id: doc.parent_chunk_id,
child_chunk_ids: doc.child_chunk_ids,
visual_stats: None,
}
})
.collect();

View File

@@ -70,7 +70,7 @@ impl QdrantDb {
return Ok(());
}
let create_url = format!("{}/collections", self.base_url);
let create_url = format!("{}/collections/{}", self.base_url, self.collection_name);
let body = serde_json::json!({
"vectors": {
"size": vector_dim,
@@ -79,7 +79,7 @@ impl QdrantDb {
});
self.client
.post(&create_url)
.put(&create_url)
.header("api-key", &self.api_key)
.header("Content-Type", "application/json")
.json(&body)
@@ -867,50 +867,6 @@ impl VectorStore for QdrantDb {
}
}
/// Sync face embeddings from PostgreSQL to Qdrant for ANN search
pub async fn sync_face_embeddings(file_uuid: &str) -> Result<()> {
use crate::core::config::DATABASE_URL;
use sqlx::Row;
let pool = sqlx::PgPool::connect(&DATABASE_URL).await?;
let table = crate::core::db::schema::table_name("face_detections");
let qdrant: QdrantDb = QdrantDb::new();
let query = format!(
"SELECT id, trace_id, frame_number, embedding FROM {} \
WHERE file_uuid = $1 AND embedding IS NOT NULL \
AND ((metadata->>'qc_ok')::boolean IS NULL OR (metadata->>'qc_ok')::boolean = true)",
table
);
let rows = sqlx::query(&query).bind(file_uuid).fetch_all(&pool).await?;
let mut count = 0u64;
for row in &rows {
let id: i32 = row.get(0);
let trace_id: Option<i32> = row.get(1);
let frame_number: i64 = row.get(2);
let embedding: Option<Vec<f32>> = row.get(3);
if let (Some(emb), Some(tid)) = (embedding, trace_id) {
if let Err(e) = qdrant
.upsert_face_embedding(id as u64, &emb, file_uuid, tid, frame_number)
.await
{
tracing::warn!("Qdrant upsert failed for face {}: {}", id, e);
continue;
}
count += 1;
}
}
tracing::info!(
"Synced {} face embeddings to Qdrant for {}",
count,
file_uuid
);
Ok(())
}
pub async fn sync_trace_embeddings(file_uuid: &str) -> Result<()> {
use crate::core::config::DATABASE_URL;
use sqlx::Row;
@@ -984,12 +940,22 @@ pub async fn sync_trace_embeddings(file_uuid: &str) -> Result<()> {
}
// Push to Qdrant in batches
// Point ID: hash(file_uuid + trace_id) for global uniqueness
for chunk in trace_avgs.chunks(500) {
let batch: Vec<(u64, &[f32], Option<serde_json::Value>)> = chunk
.iter()
.map(|t| {
let point_id = {
use sha2::{Digest, Sha256};
let mut hasher = Sha256::new();
hasher.update(file_uuid.as_bytes());
hasher.update(b"_");
hasher.update(t.tid.to_string().as_bytes());
let hash = hasher.finalize();
u64::from_be_bytes(hash[0..8].try_into().unwrap())
};
(
t.tid as u64,
point_id,
t.avg_emb.as_slice(),
Some(serde_json::json!({
"trace_id": t.tid,

View File

@@ -319,7 +319,9 @@ impl RedisClient {
"timestamp": chrono::Utc::now().to_rfc3339(),
});
let _: usize = conn.publish(&channel, serde_json::to_string(&alert_json)?).await?;
let _: usize = conn
.publish(&channel, serde_json::to_string(&alert_json)?)
.await?;
tracing::warn!(
"Processor alert: {} | {} | {} | {}",

View File

@@ -78,7 +78,10 @@ impl SyncDb {
pub async fn embed_text(&self, text: &str) -> Result<Vec<f32>> {
let client = reqwest::Client::new();
let response = client
.post(&format!("{}/api/embeddings", crate::core::config::OLLAMA_URL.as_str()))
.post(&format!(
"{}/api/embeddings",
crate::core::config::OLLAMA_URL.as_str()
))
.json(&serde_json::json!({
"model": "all-minilm",
"prompt": text,