feat: file dedup — content_hash SHA256 + /files/lookup API + auto-rename on name collision
This commit is contained in:
@@ -18,6 +18,34 @@ impl IngestionService {
|
||||
Self { db }
|
||||
}
|
||||
|
||||
/// Resolve name collision: if file_name exists with different content, append ` (N)` suffix.
|
||||
async fn resolve_filename(&self, file_name: &str, content_hash: &str) -> String {
|
||||
let table = schema::table_name("videos");
|
||||
let base = file_name.to_string();
|
||||
let dot_pos = base.rfind('.');
|
||||
let (stem, ext) = match dot_pos {
|
||||
Some(p) => (base[..p].to_string(), base[p..].to_string()),
|
||||
None => (base.clone(), String::new()),
|
||||
};
|
||||
let mut candidate = base.clone();
|
||||
let mut attempt = 0usize;
|
||||
loop {
|
||||
let conflict: Option<String> = sqlx::query_scalar(
|
||||
&format!("SELECT file_uuid FROM {} WHERE file_name = $1 AND (content_hash IS DISTINCT FROM $2 OR content_hash IS NULL)", table)
|
||||
)
|
||||
.bind(&candidate)
|
||||
.bind(content_hash)
|
||||
.fetch_optional(self.db.pool())
|
||||
.await
|
||||
.unwrap_or(None);
|
||||
if conflict.is_none() {
|
||||
return candidate;
|
||||
}
|
||||
attempt += 1;
|
||||
candidate = format!("{} ({}){}", stem, attempt, ext);
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn ingest(&self, file_path: &str) -> Result<Option<String>> {
|
||||
let path = Path::new(file_path);
|
||||
|
||||
@@ -32,16 +60,33 @@ impl IngestionService {
|
||||
.to_string_lossy()
|
||||
.to_string();
|
||||
|
||||
// Stable UUID based on MAC + Birthday + Filename.
|
||||
// Moving the file (path change) keeps the SAME identity.
|
||||
// 1. Compute SHA256 for dedup
|
||||
let content_hash = crate::core::storage::content_hash::compute_sha256(&canonical_path).ok().unwrap_or_default();
|
||||
|
||||
// 1. Look for existing Birthday (Identity Anchor)
|
||||
// If the file (by name) was registered before, use its original birth time.
|
||||
// 2. Hash check — same content = already registered
|
||||
let videos_table = schema::table_name("videos");
|
||||
if !content_hash.is_empty() {
|
||||
if let Ok(Some(existing_uuid)) = sqlx::query_scalar::<_, String>(
|
||||
&format!("SELECT file_uuid FROM {} WHERE content_hash = $1 LIMIT 1", videos_table)
|
||||
)
|
||||
.bind(&content_hash)
|
||||
.fetch_optional(self.db.pool())
|
||||
.await
|
||||
{
|
||||
info!("Content already registered: {} ({})", filename, existing_uuid);
|
||||
return Ok(Some(existing_uuid));
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Resolve name conflict (same name, different content → auto-rename)
|
||||
let final_name = self.resolve_filename(&filename, &content_hash).await;
|
||||
|
||||
// 4. Compute UUID with resolved name
|
||||
let videos_table = schema::table_name("videos");
|
||||
let birthday = sqlx::query_scalar::<_, chrono::DateTime<chrono::Utc>>(
|
||||
&format!("SELECT registration_time FROM {} WHERE file_name = $1 AND registration_time IS NOT NULL LIMIT 1", videos_table)
|
||||
)
|
||||
.bind(&filename)
|
||||
.bind(&final_name)
|
||||
.fetch_optional(self.db.pool())
|
||||
.await
|
||||
.ok()
|
||||
@@ -54,30 +99,15 @@ impl IngestionService {
|
||||
.map(|p| p.to_string_lossy().to_string())
|
||||
.unwrap_or_default();
|
||||
|
||||
// 2. Compute UUID
|
||||
let uuid = uuid_utils::compute_birth_uuid(
|
||||
&uuid_utils::get_mac_address(),
|
||||
&birthday,
|
||||
&canonical_path.to_string_lossy(),
|
||||
&filename,
|
||||
&final_name,
|
||||
);
|
||||
|
||||
let parent = canonical_path
|
||||
.parent()
|
||||
.map(|p| p.to_string_lossy().to_string())
|
||||
.unwrap_or_default();
|
||||
|
||||
let username = uuid_utils::extract_username_from_path(&parent);
|
||||
|
||||
if let Ok(Some(_)) = self.db.get_video_by_uuid(&uuid).await {
|
||||
info!(
|
||||
"Video already registered: {} ({})",
|
||||
path.file_name().unwrap_or_default().to_string_lossy(),
|
||||
uuid
|
||||
);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
info!("Starting ingestion for: {} ({})", path.display(), uuid);
|
||||
|
||||
let probe_result = probe::probe_video(file_path)
|
||||
@@ -195,6 +225,16 @@ impl IngestionService {
|
||||
.await
|
||||
.with_context(|| "Failed to register video in database")?;
|
||||
|
||||
// Store content_hash for dedup
|
||||
if !content_hash.is_empty() {
|
||||
let vt = schema::table_name("videos");
|
||||
let _ = sqlx::query(&format!("UPDATE {} SET content_hash = $1 WHERE file_uuid = $2", vt))
|
||||
.bind(&content_hash)
|
||||
.bind(&uuid)
|
||||
.execute(self.db.pool())
|
||||
.await;
|
||||
}
|
||||
|
||||
self.db
|
||||
.set_registration_time(&uuid)
|
||||
.await
|
||||
@@ -205,10 +245,6 @@ impl IngestionService {
|
||||
.await
|
||||
.with_context(|| "Failed to set birth_registration")?;
|
||||
|
||||
info!(
|
||||
"Successfully registered video: {} (UUID: {}, Birth UUID: {})",
|
||||
record.file_name, uuid, uuid
|
||||
);
|
||||
Ok(Some(uuid))
|
||||
}
|
||||
}
|
||||
|
||||
18
src/core/storage/content_hash.rs
Normal file
18
src/core/storage/content_hash.rs
Normal file
@@ -0,0 +1,18 @@
|
||||
use sha2::{Digest, Sha256};
|
||||
use std::io::Read;
|
||||
use std::path::Path;
|
||||
use anyhow::Result;
|
||||
|
||||
/// Compute SHA256 of the entire file content
|
||||
pub fn compute_sha256(path: &Path) -> Result<String> {
|
||||
let mut file = std::fs::File::open(path)?;
|
||||
let mut hasher = Sha256::new();
|
||||
let mut buf = [0u8; 65536];
|
||||
loop {
|
||||
let n = file.read(&mut buf)?;
|
||||
if n == 0 { break; }
|
||||
hasher.update(&buf[..n]);
|
||||
}
|
||||
let hash = format!("{:x}", hasher.finalize());
|
||||
Ok(hash)
|
||||
}
|
||||
@@ -1,7 +1,9 @@
|
||||
pub mod content_hash;
|
||||
pub mod file_manager;
|
||||
pub mod output_dir;
|
||||
pub mod uuid;
|
||||
|
||||
pub use content_hash::compute_sha256;
|
||||
pub use file_manager::FileManager;
|
||||
pub use output_dir::OutputDir;
|
||||
pub use uuid::compute_uuid;
|
||||
|
||||
Reference in New Issue
Block a user