feat: file dedup — content_hash SHA256 + /files/lookup API + auto-rename on name collision

This commit is contained in:
Accusys
2026-05-14 20:24:21 +08:00
parent 189bec929a
commit 4d1fe2d26f
5 changed files with 260 additions and 51 deletions

View File

@@ -18,6 +18,34 @@ impl IngestionService {
Self { db }
}
/// Resolve a name collision by appending a ` (N)` suffix before the extension.
///
/// Starting with `file_name` itself, the videos table is queried for a row that has
/// the candidate name and a `content_hash` that differs from `content_hash` (or is
/// NULL). While such a row exists the candidate becomes `"<stem> (N)<ext>"` with
/// N = 1, 2, 3, ... The first candidate without a conflicting row is returned.
///
/// A query error is treated the same as "no conflicting row", so the current
/// candidate is returned rather than looping on a failing database.
async fn resolve_filename(&self, file_name: &str, content_hash: &str) -> String {
    // The statement text is identical for every attempt; build it once.
    let sql = format!(
        "SELECT file_uuid FROM {} WHERE file_name = $1 AND (content_hash IS DISTINCT FROM $2 OR content_hash IS NULL)",
        schema::table_name("videos")
    );
    // Split at the last dot so the suffix lands before the extension. A dot at
    // position 0 (e.g. ".hidden") belongs to the stem; otherwise the stem would be
    // empty and the result would read " (1).hidden".
    let (stem, ext) = match file_name.rfind('.') {
        Some(pos) if pos > 0 => file_name.split_at(pos),
        _ => (file_name, ""),
    };
    let mut candidate = file_name.to_string();
    let mut attempt = 0usize;
    loop {
        let conflict: Option<String> = sqlx::query_scalar(&sql)
            .bind(&candidate)
            .bind(content_hash)
            .fetch_optional(self.db.pool())
            .await
            .unwrap_or(None);
        if conflict.is_none() {
            return candidate;
        }
        attempt += 1;
        candidate = format!("{} ({}){}", stem, attempt, ext);
    }
}
pub async fn ingest(&self, file_path: &str) -> Result<Option<String>> {
let path = Path::new(file_path);
@@ -32,16 +60,33 @@ impl IngestionService {
.to_string_lossy()
.to_string();
// Stable UUID based on MAC + Birthday + Filename.
// Moving the file (path change) keeps the SAME identity.
// 1. Compute SHA256 for dedup
let content_hash = crate::core::storage::content_hash::compute_sha256(&canonical_path).ok().unwrap_or_default();
// 1. Look for existing Birthday (Identity Anchor)
// If the file (by name) was registered before, use its original birth time.
// 2. Hash check — same content = already registered
let videos_table = schema::table_name("videos");
if !content_hash.is_empty() {
if let Ok(Some(existing_uuid)) = sqlx::query_scalar::<_, String>(
&format!("SELECT file_uuid FROM {} WHERE content_hash = $1 LIMIT 1", videos_table)
)
.bind(&content_hash)
.fetch_optional(self.db.pool())
.await
{
info!("Content already registered: {} ({})", filename, existing_uuid);
return Ok(Some(existing_uuid));
}
}
// 3. Resolve name conflict (same name, different content → auto-rename)
let final_name = self.resolve_filename(&filename, &content_hash).await;
// 4. Compute UUID with resolved name
let videos_table = schema::table_name("videos");
let birthday = sqlx::query_scalar::<_, chrono::DateTime<chrono::Utc>>(
&format!("SELECT registration_time FROM {} WHERE file_name = $1 AND registration_time IS NOT NULL LIMIT 1", videos_table)
)
.bind(&filename)
.bind(&final_name)
.fetch_optional(self.db.pool())
.await
.ok()
@@ -54,30 +99,15 @@ impl IngestionService {
.map(|p| p.to_string_lossy().to_string())
.unwrap_or_default();
// 2. Compute UUID
let uuid = uuid_utils::compute_birth_uuid(
&uuid_utils::get_mac_address(),
&birthday,
&canonical_path.to_string_lossy(),
&filename,
&final_name,
);
let parent = canonical_path
.parent()
.map(|p| p.to_string_lossy().to_string())
.unwrap_or_default();
let username = uuid_utils::extract_username_from_path(&parent);
if let Ok(Some(_)) = self.db.get_video_by_uuid(&uuid).await {
info!(
"Video already registered: {} ({})",
path.file_name().unwrap_or_default().to_string_lossy(),
uuid
);
return Ok(None);
}
info!("Starting ingestion for: {} ({})", path.display(), uuid);
let probe_result = probe::probe_video(file_path)
@@ -195,6 +225,16 @@ impl IngestionService {
.await
.with_context(|| "Failed to register video in database")?;
// Store content_hash for dedup
if !content_hash.is_empty() {
let vt = schema::table_name("videos");
let _ = sqlx::query(&format!("UPDATE {} SET content_hash = $1 WHERE file_uuid = $2", vt))
.bind(&content_hash)
.bind(&uuid)
.execute(self.db.pool())
.await;
}
self.db
.set_registration_time(&uuid)
.await
@@ -205,10 +245,6 @@ impl IngestionService {
.await
.with_context(|| "Failed to set birth_registration")?;
info!(
"Successfully registered video: {} (UUID: {}, Birth UUID: {})",
record.file_name, uuid, uuid
);
Ok(Some(uuid))
}
}

View File

@@ -0,0 +1,18 @@
use sha2::{Digest, Sha256};
use std::io::Read;
use std::path::Path;
use anyhow::Result;
/// Compute the SHA-256 digest of the entire file at `path`.
///
/// The file is streamed through a fixed 64 KiB buffer, so memory use does not
/// depend on the file size.
///
/// # Errors
///
/// Returns an error if the file cannot be opened or if a read fails. Reads that
/// fail with `ErrorKind::Interrupted` are retried instead of being reported.
///
/// Returns the digest formatted as lowercase hexadecimal.
pub fn compute_sha256(path: &Path) -> Result<String> {
    let mut file = std::fs::File::open(path)?;
    let mut hasher = Sha256::new();
    let mut buf = [0u8; 65536];
    loop {
        let n = match file.read(&mut buf) {
            // A zero-length read signals end of file.
            Ok(0) => break,
            Ok(n) => n,
            // An interrupted read is transient; retry rather than abort the hash.
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e.into()),
        };
        hasher.update(&buf[..n]);
    }
    let hash = format!("{:x}", hasher.finalize());
    Ok(hash)
}

View File

@@ -1,7 +1,9 @@
pub mod content_hash;
pub mod file_manager;
pub mod output_dir;
pub mod uuid;
pub use content_hash::compute_sha256;
pub use file_manager::FileManager;
pub use output_dir::OutputDir;
pub use uuid::compute_uuid;