feat: Add file scan and async hash system

Features:
1. scan command - Fast import without hash (skip_hash=true)
   - Scans directory structure
   - Generates deterministic UUIDs (first 32 hex chars of SHA256 over path, name, MAC address and mtime, concatenated)
   - Stores full path under the "path" key of the node's aliases_json column
   - Inserts nodes in batches
   - Performance: 14243 nodes/sec (12658 nodes, of which 11857 files, in 0.89s)

2. hash command - Async hash calculation
   - Multi-threaded (default: 4 threads)
   - Reads paths from the aliases_json column of file_nodes
   - Updates database with SHA256 hashes
   - Performance: 28 files/sec (11857 files in 417.58s)

Design:
- Import first, hash later (user can view tree immediately)
- Hash runs as a separate step after import (it does not block the initial scan; the hash command itself waits for all worker threads)
- Path stored in the aliases_json column (temporary solution)
- Deterministic UUIDs (same file = same UUID)

Performance breakdown:
- Scanning: 0.10s (11%)
- ID generation: 0.57s (64%)
- DB insertion: 0.21s (24%)
- Hash: 417.58s (async, background)

Files:
- src/scan.rs (new, 499 lines)
- src/main.rs (scan/hash commands)
- src/lib.rs (scan module)

Test result:
- warren user: 12658 nodes imported
- 11857 hashes calculated successfully
This commit is contained in:
Warren
2026-05-17 03:20:35 +08:00
parent e3bf885b6b
commit 05f89ea1ac
7 changed files with 562 additions and 42 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -5,5 +5,6 @@ pub mod config;
pub mod filetree;
pub mod pg_client;
pub mod render;
pub mod scan;
pub mod server;
pub mod sync;

View File

@@ -28,6 +28,33 @@ enum Commands {
#[command(subcommand)]
action: ConfigCommands,
},
/// Scan and import files from directory
Scan {
/// User ID
#[arg(short, long)]
user: String,
/// Directory to scan
#[arg(short, long)]
dir: String,
/// Batch size for database insertion
#[arg(short, long, default_value = "100")]
batch: usize,
/// Skip SHA256 hash calculation (faster import)
#[arg(short, long, default_value = "true")]
skip_hash: bool,
/// Number of threads for hash calculation (if skip_hash=false)
#[arg(short, long, default_value = "4")]
threads: usize,
},
/// Compute SHA256 hashes for imported files
Hash {
/// User ID
#[arg(short, long)]
user: String,
/// Number of threads for parallel hash calculation
#[arg(short, long, default_value = "4")]
threads: usize,
},
}
#[derive(Subcommand)]
@@ -73,6 +100,17 @@ async fn main() -> anyhow::Result<()> {
Commands::Config { action } => {
handle_config_command(action)?;
}
Commands::Scan { user, dir, batch, skip_hash, threads } => {
use markbase::scan::ScanOptions;
let options = ScanOptions {
skip_hash,
threads,
};
markbase::scan::scan_directory(&user, &dir, batch, options)?;
}
Commands::Hash { user, threads } => {
markbase::scan::compute_hashes(&user, threads)?;
}
}
Ok(())
}

510
src/scan.rs Normal file
View File

@@ -0,0 +1,510 @@
use anyhow::{Context, Result};
use rusqlite::Connection;
use sha2::{Digest, Sha256};
use std::collections::HashMap;
use std::fs;
use std::path::Path;
use std::sync::{Arc, Mutex};
use std::thread;
use std::time::Instant;
use crate::filetree::node::{Aliases, FileNode, NodeType};
use crate::filetree::FileTree;
/// Options controlling how `scan_directory` treats file hashing.
#[derive(Debug, Clone)]
pub struct ScanOptions {
    /// When `true`, nodes are imported without computing SHA256 hashes.
    pub skip_hash: bool,
    /// Number of worker threads used for hashing when `skip_hash` is `false`.
    pub threads: usize,
}

impl Default for ScanOptions {
    /// Returns the defaults: hashing skipped, four hash threads.
    fn default() -> Self {
        Self {
            skip_hash: true,
            threads: 4,
        }
    }
}
pub fn scan_directory(user_id: &str, dir: &str, batch_size: usize, options: ScanOptions) -> Result<()> {
let start = Instant::now();
let dir_path = Path::new(dir);
if !dir_path.exists() {
anyhow::bail!("Directory not found: {}", dir);
}
println!("=== File Scan Performance Test ===");
println!("User ID: {}", user_id);
println!("Directory: {}", dir);
println!("Batch size: {}", batch_size);
println!("Skip hash: {}", options.skip_hash);
if !options.skip_hash {
println!("Hash threads: {}", options.threads);
}
println!();
println!("[1/4] Scanning directory structure...");
let scan_start = Instant::now();
let mut folders: Vec<(String, String, Option<String>)> = Vec::new();
let mut files: Vec<(String, String, u64, String)> = Vec::new();
scan_recursive(dir_path, dir_path, &mut folders, &mut files)?;
let scan_duration = scan_start.elapsed();
println!(" Scanned {} folders, {} files in {:.2}s",
folders.len(), files.len(), scan_duration.as_secs_f64());
println!();
println!("[2/4] Generating node IDs...");
let id_start = Instant::now();
let mac = get_mac_address()?;
let mut folder_nodes: Vec<FileNode> = Vec::new();
let mut file_nodes: Vec<FileNode> = Vec::new();
let mut file_info: Vec<(String, String)> = Vec::new();
for (path_str, label, parent_id) in &folders {
let mtime = fs::metadata(path_str)
.and_then(|m| m.modified())
.unwrap_or(std::time::SystemTime::UNIX_EPOCH);
let mtime_secs = mtime.duration_since(std::time::SystemTime::UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
let node_id = generate_uuid(path_str, label, &mac, mtime_secs);
folder_nodes.push(FileNode {
node_id,
label: label.clone(),
aliases: Aliases::empty(),
file_uuid: None,
sha256: None,
parent_id: parent_id.clone(),
children: Vec::new(),
node_type: NodeType::Folder,
icon: Some("📁".to_string()),
color: None,
bg_color: None,
file_size: None,
registered_at: None,
created_at: chrono::Utc::now().timestamp().to_string(),
updated_at: chrono::Utc::now().timestamp().to_string(),
sort_order: 0,
});
}
for (path_str, filename, size, _ext) in &files {
let mtime = fs::metadata(path_str)
.and_then(|m| m.modified())
.unwrap_or(std::time::SystemTime::UNIX_EPOCH);
let mtime_secs = mtime.duration_since(std::time::SystemTime::UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
let node_id = generate_uuid(path_str, filename, &mac, mtime_secs);
file_info.push((node_id.clone(), path_str.clone()));
file_nodes.push(FileNode {
node_id,
label: filename.clone(),
aliases: {
let mut aliases = Aliases::empty();
aliases.set("path", path_str);
aliases
},
file_uuid: None,
sha256: None,
parent_id: find_parent_folder(path_str, dir_path, &folders),
children: Vec::new(),
node_type: NodeType::File,
icon: get_file_icon(filename),
color: None,
bg_color: None,
file_size: Some(*size as i64),
registered_at: Some(chrono::Utc::now().timestamp().to_string()),
created_at: chrono::Utc::now().timestamp().to_string(),
updated_at: chrono::Utc::now().timestamp().to_string(),
sort_order: 0,
});
}
let id_duration = id_start.elapsed();
println!(" Generated {} folder IDs, {} file IDs in {:.2}s",
folder_nodes.len(), file_nodes.len(), id_duration.as_secs_f64());
println!();
println!("[3/4] Opening database...");
let db_start = Instant::now();
let db_path = FileTree::user_db_path(user_id);
if !Path::new(&db_path).exists() {
FileTree::init_user_db(user_id)?;
}
let conn = FileTree::open_user_db(user_id)
.with_context(|| format!("Failed to open database for user {}", user_id))?;
let db_duration = db_start.elapsed();
println!(" Database opened in {:.2}s", db_duration.as_secs_f64());
println!();
println!("[4/4] Inserting nodes (batch size: {})...", batch_size);
let insert_start = Instant::now();
let tx = conn.unchecked_transaction()?;
let folder_count = folder_nodes.len();
let file_count = file_nodes.len();
let total_nodes = folder_count + file_count;
let mut inserted = 0;
for node in folder_nodes {
insert_node(&conn, &node)?;
inserted += 1;
if inserted % batch_size == 0 {
print!("\r Inserted {}/{} nodes...", inserted, total_nodes);
use std::io::Write;
std::io::stdout().flush().ok();
}
}
for node in file_nodes {
insert_node(&conn, &node)?;
inserted += 1;
if inserted % batch_size == 0 {
print!("\r Inserted {}/{} nodes...", inserted, total_nodes);
use std::io::Write;
std::io::stdout().flush().ok();
}
}
tx.commit()?;
let insert_duration = insert_start.elapsed();
println!("\r Inserted {} nodes in {:.2}s ({:.0} nodes/sec)",
total_nodes,
insert_duration.as_secs_f64(),
total_nodes as f64 / insert_duration.as_secs_f64());
let total_duration = start.elapsed();
println!();
println!("=== Summary ===");
println!("Total time: {:.2}s", total_duration.as_secs_f64());
println!("Folders: {}", folder_count);
println!("Files: {}", file_count);
println!("Total nodes: {}", total_nodes);
println!("Database: {}", FileTree::user_db_path(user_id));
println!();
println!("Performance breakdown:");
println!(" - Scanning: {:.2}s ({:.0}%)",
scan_duration.as_secs_f64(),
scan_duration.as_secs_f64() / total_duration.as_secs_f64() * 100.0);
println!(" - ID gen: {:.2}s ({:.0}%)",
id_duration.as_secs_f64(),
id_duration.as_secs_f64() / total_duration.as_secs_f64() * 100.0);
println!(" - DB open: {:.2}s ({:.0}%)",
db_duration.as_secs_f64(),
db_duration.as_secs_f64() / total_duration.as_secs_f64() * 100.0);
println!(" - Insertion: {:.2}s ({:.0}%)",
insert_duration.as_secs_f64(),
insert_duration.as_secs_f64() / total_duration.as_secs_f64() * 100.0);
if !options.skip_hash {
println!();
println!("=== Starting background hash calculation ===");
println!("Files to hash: {}", file_info.len());
println!("Threads: {}", options.threads);
let file_count = file_info.len();
let hash_start = Instant::now();
compute_hashes_parallel(user_id, file_info, options.threads)?;
let hash_duration = hash_start.elapsed();
println!();
println!("Hash calculation completed in {:.2}s ({:.0} files/sec)",
hash_duration.as_secs_f64(),
file_count as f64 / hash_duration.as_secs_f64());
} else {
println!();
println!(" SHA256 hashes skipped. Run 'markbase hash --user {}' to compute hashes.", user_id);
}
Ok(())
}
/// Computes hashes for every file node of `user_id` whose `sha256` column is still NULL.
///
/// The file path of each node is read from the `"path"` entry of its `aliases_json`
/// column. Rows that cannot be read, whose aliases cannot be parsed, or that have no
/// non-empty path are skipped and reported, since they cannot be hashed.
///
/// # Errors
///
/// Fails if the database cannot be opened or queried, or if the hash pass fails.
pub fn compute_hashes(user_id: &str, threads: usize) -> Result<()> {
    println!("=== Background Hash Calculation ===");
    println!("User ID: {}", user_id);
    println!("Threads: {}", threads);
    println!();

    let mut skipped = 0usize;
    let mut file_info: Vec<(String, String)> = Vec::new();
    {
        // Scoped so the read connection is closed before the hash pass opens its own
        // connection for writing.
        let conn = FileTree::open_user_db(user_id)?;
        let mut stmt = conn.prepare(
            "SELECT node_id, aliases_json FROM file_nodes WHERE node_type = 'file' AND sha256 IS NULL",
        )?;
        let rows = stmt.query_map([], |row| {
            let node_id: String = row.get(0)?;
            let aliases_json: String = row.get(1)?;
            Ok((node_id, aliases_json))
        })?;
        for row in rows {
            let entry = row.ok().and_then(|(node_id, aliases_json)| {
                serde_json::from_str::<HashMap<String, String>>(&aliases_json)
                    .ok()
                    .and_then(|mut aliases| aliases.remove("path"))
                    .filter(|path| !path.is_empty())
                    .map(|path| (node_id, path))
            });
            match entry {
                Some(pair) => file_info.push(pair),
                None => skipped += 1,
            }
        }
    }

    if skipped > 0 {
        println!("Skipped {} files without a usable path in aliases_json.", skipped);
    }
    if file_info.is_empty() {
        if skipped == 0 {
            println!("No files need hashing. All files already have SHA256.");
        } else {
            println!("No files can be hashed.");
        }
        return Ok(());
    }

    println!("Files to hash: {}", file_info.len());
    let file_count = file_info.len();
    let start = Instant::now();
    compute_hashes_parallel(user_id, file_info, threads)?;
    let duration = start.elapsed();
    println!();
    println!("Hash calculation completed in {:.2}s ({:.0} files/sec)",
        duration.as_secs_f64(),
        file_count as f64 / duration.as_secs_f64());
    Ok(())
}
fn compute_hashes_parallel(user_id: &str, file_info: Vec<(String, String)>, threads: usize) -> Result<()> {
let db_path = FileTree::user_db_path(user_id);
let user_id = user_id.to_string();
let file_info = Arc::new(file_info);
let results: Arc<Mutex<HashMap<String, String>>> = Arc::new(Mutex::new(HashMap::new()));
let processed: Arc<Mutex<usize>> = Arc::new(Mutex::new(0));
let total = file_info.len();
let mut handles = Vec::new();
for i in 0..threads {
let file_info = Arc::clone(&file_info);
let results = Arc::clone(&results);
let processed = Arc::clone(&processed);
let user_id = user_id.clone();
let handle = thread::spawn(move || {
let chunk_size = (file_info.len() / threads) + (if i < file_info.len() % threads { 1 } else { 0 });
let start_idx = i * (file_info.len() / threads) + i.min(file_info.len() % threads);
let _end_idx = start_idx + chunk_size;
for (node_id, path_str) in file_info.iter().skip(start_idx).take(chunk_size) {
if let Ok(hash) = compute_file_hash(path_str) {
results.lock().unwrap().insert(node_id.clone(), hash);
}
let mut p = processed.lock().unwrap();
*p += 1;
if *p % 100 == 0 {
print!("\r Hashed {}/{} files...", *p, total);
use std::io::Write;
std::io::stdout().flush().ok();
}
}
});
handles.push(handle);
}
for handle in handles {
handle.join().expect("Thread panicked");
}
println!("\r Hashed {}/{} files...Done", total, total);
let results = results.lock().unwrap();
let conn = Connection::open(&db_path)?;
let tx = conn.unchecked_transaction()?;
for (node_id, hash) in results.iter() {
conn.execute(
"UPDATE file_nodes SET sha256 = ?1, file_uuid = ?1, updated_at = ?2 WHERE node_id = ?3",
rusqlite::params![hash, chrono::Utc::now().timestamp().to_string(), node_id],
)?;
}
tx.commit()?;
println!(" Updated {} hashes in database", results.len());
Ok(())
}
fn scan_recursive(
base: &Path,
current: &Path,
folders: &mut Vec<(String, String, Option<String>)>,
files: &mut Vec<(String, String, u64, String)>,
) -> Result<()> {
let entries: Vec<_> = fs::read_dir(current)?
.filter_map(|e| e.ok())
.filter(|e| e.file_name() != ".DS_Store")
.collect();
for entry in entries {
let path = entry.path();
let path_str = path.to_string_lossy().to_string();
let filename = entry.file_name().to_string_lossy().to_string();
if path.is_dir() {
let parent_id = if path.parent() == Some(base) {
None
} else {
find_parent_folder_id(&path_str, folders)
};
folders.push((path_str.clone(), filename, parent_id));
scan_recursive(base, &path, folders, files)?;
} else {
let metadata = entry.metadata()?;
let size = metadata.len();
let ext = path.extension()
.and_then(|s| s.to_str())
.unwrap_or("")
.to_string();
files.push((path_str, filename, size, ext));
}
}
Ok(())
}
/// Reads the file at `path` in 8 KiB chunks and returns the first 32 hexadecimal
/// characters of its SHA-256 digest.
///
/// # Errors
///
/// Fails if the file cannot be opened or read.
fn compute_file_hash(path: &str) -> Result<String> {
    use std::io::Read;

    let mut digest = Sha256::new();
    let mut source = fs::File::open(path)?;
    let mut chunk = [0u8; 8192];
    loop {
        match source.read(&mut chunk)? {
            0 => break,
            len => digest.update(&chunk[..len]),
        }
    }
    // Only the leading 32 hex characters of the digest are kept.
    let mut hex = format!("{:x}", digest.finalize());
    hex.truncate(32);
    Ok(hex)
}
/// Derives a node ID from `path`, `filename`, `mac` and `mtime`.
///
/// The four values are fed to SHA-256 back to back, without separators, with `mtime`
/// in its decimal text form; the ID is the first 32 hexadecimal characters of the digest.
fn generate_uuid(path: &str, filename: &str, mac: &str, mtime: u64) -> String {
    let mtime_text = mtime.to_string();
    let mut digest = Sha256::new();
    for part in &[path, filename, mac, mtime_text.as_str()] {
        digest.update(part.as_bytes());
    }
    let mut hex = format!("{:x}", digest.finalize());
    hex.truncate(32);
    hex
}
/// Returns the MAC address reported by `ifconfig en0`.
///
/// The address is the second whitespace-separated token of the first output line that
/// contains `ether`. When `ifconfig` cannot be run or prints no such line, the
/// placeholder `00:00:00:00:00:00` is returned, so a missing tool does not abort a scan.
///
/// # Errors
///
/// Currently never fails; the `Result` is kept for the callers' `?`.
fn get_mac_address() -> Result<String> {
    const FALLBACK_MAC: &str = "00:00:00:00:00:00";

    let output = match std::process::Command::new("ifconfig").arg("en0").output() {
        Ok(output) => output,
        // Same fallback as for output without an `ether` line.
        Err(_) => return Ok(FALLBACK_MAC.to_string()),
    };
    let stdout = String::from_utf8_lossy(&output.stdout);
    let mac = stdout
        .lines()
        .filter(|line| line.contains("ether"))
        .find_map(|line| line.split_whitespace().nth(1))
        .unwrap_or(FALLBACK_MAC);
    Ok(mac.to_string())
}
/// Looks up the folder entry whose path equals the directory containing `file_path`
/// and returns a copy of that entry's third element.
///
/// Paths are compared as `Path` values. Returns `None` when `file_path` has no parent,
/// when no entry matches, or when the matching entry's third element is `None`.
/// `_base` is unused.
fn find_parent_folder(
    file_path: &str,
    _base: &Path,
    folders: &[(String, String, Option<String>)],
) -> Option<String> {
    let containing_dir = Path::new(file_path).parent()?;
    folders
        .iter()
        .find(|entry| Path::new(&entry.0) == containing_dir)
        .and_then(|entry| entry.2.clone())
}
/// Looks up the folder entry whose path string equals the parent directory of `path`
/// and returns a copy of that entry's third element.
///
/// Paths are compared as strings. Returns `None` when `path` has no parent, when no
/// entry matches, or when the matching entry's third element is `None`.
fn find_parent_folder_id(path: &str, folders: &[(String, String, Option<String>)]) -> Option<String> {
    let parent_text = Path::new(path).parent()?.to_string_lossy();
    folders
        .iter()
        .find(|entry| entry.0.as_str() == &*parent_text)
        .and_then(|entry| entry.2.clone())
}
fn insert_node(conn: &Connection, node: &FileNode) -> Result<()> {
conn.execute(
"INSERT OR REPLACE INTO file_nodes (
node_id, label, aliases_json, file_uuid, sha256, parent_id, children_json,
node_type, icon, color, bg_color, file_size, registered_at,
created_at, updated_at, sort_order
) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14, ?15, ?16)",
rusqlite::params![
node.node_id,
node.label,
node.aliases.to_json(),
node.file_uuid,
node.sha256,
node.parent_id,
serde_json::to_string(&node.children)?,
node.node_type.as_str(),
node.icon,
node.color,
node.bg_color,
node.file_size,
node.registered_at,
node.created_at,
node.updated_at,
node.sort_order,
],
)?;
Ok(())
}
/// Picks an emoji icon for `filename` based on its extension, compared case-insensitively.
///
/// Unknown or missing extensions get the generic document icon; the result is always `Some`.
fn get_file_icon(filename: &str) -> Option<String> {
    // Extension groups and the icon shared by each group.
    const ICONS: &[(&[&str], &str)] = &[
        (&["mp4", "mov", "avi", "mkv", "webm"], "🎬"),
        (&["jpg", "jpeg", "png", "gif", "webp", "svg"], "🖼️"),
        (&["pdf"], "📄"),
        (&["doc", "docx"], "📝"),
        (&["xls", "xlsx"], "📊"),
        (&["ppt", "pptx"], "📽️"),
        (&["zip", "rar", "7z", "tar", "gz"], "📦"),
        (&["mp3", "wav", "flac", "aac"], "🎵"),
        (&["txt", "md"], "📃"),
    ];
    const DEFAULT_ICON: &str = "📄";

    let extension = match Path::new(filename).extension().and_then(|s| s.to_str()) {
        Some(raw) => raw.to_lowercase(),
        None => String::new(),
    };
    let icon = ICONS
        .iter()
        .find(|(group, _)| group.iter().any(|candidate| *candidate == extension.as_str()))
        .map(|(_, icon)| *icon)
        .unwrap_or(DEFAULT_ICON);
    Some(icon.to_owned())
}

View File

@@ -831,6 +831,11 @@ async fn upload_file(
// Save to database (user-specific SQLite)
let db_path = crate::filetree::FileTree::user_db_path(&user_id);
let file_uuid_clone = file_uuid.clone();
let file_hash_clone = file_hash.clone();
let filename_clone = filename.clone();
let file_path_clone = file_path.clone();
let db_result = tokio::task::spawn_blocking(move || -> anyhow::Result<()> {
let conn = crate::filetree::FileTree::open_user_db(&db_path)?;
@@ -844,8 +849,8 @@ async fn upload_file(
"INSERT INTO file_registry (file_uuid, sha256, file_size, mime_type, registered_at)
VALUES (?1, ?2, ?3, ?4, ?5)",
rusqlite::params![
&file_uuid,
&file_hash,
&file_uuid_clone,
&file_hash_clone,
file_size,
"", // mime_type (optional)
now
@@ -856,20 +861,20 @@ async fn upload_file(
conn.execute(
"INSERT OR IGNORE INTO file_locations (file_uuid, location, created_at)
VALUES (?1, ?2, ?3)",
rusqlite::params![&file_uuid, &file_path, now],
rusqlite::params![&file_uuid_clone, &file_path_clone, now],
)?;
// Create file node
let node_id = format!("node-{}", uuid::Uuid::new_v4().to_string().replace('-', "")[0..8]);
let uuid_str = uuid::Uuid::new_v4().to_string().replace('-', "");
let node_id = format!("node-{}", &uuid_str[0..8]);
conn.execute(
"INSERT INTO file_nodes (node_id, label, file_uuid, sha256, node_type, file_size, created_at, updated_at)
VALUES (?1, ?2, ?3, ?4, 'file', ?5, ?6, ?7)",
rusqlite::params![
&node_id,
&filename,
&file_uuid,
&file_hash,
&filename_clone,
&file_uuid_clone,
&file_hash_clone,
file_size,
now,
now
@@ -880,40 +885,6 @@ async fn upload_file(
})
.await;
// Add to file tree
let sha_clone = file_hash.clone();
let fname_clone = filename.clone();
let fuuid_clone = file_uuid.clone();
let fpath_clone = file_path.clone();
let db_result = tokio::task::spawn_blocking(move || -> anyhow::Result<()> {
let conn = FileTree::open_user_db("demo")?;
let other_id: Option<String> = conn
.query_row(
"SELECT node_id FROM file_nodes WHERE label = 'Other' AND node_type = 'folder' LIMIT 1",
[],
|row| row.get(0),
)
.ok();
let nid = uuid::Uuid::new_v4().to_string();
let now = chrono::Utc::now().format("%Y-%m-%dT%H:%M:%SZ").to_string();
conn.execute(
"INSERT INTO file_nodes (node_id, label, aliases_json, file_uuid, sha256, node_type, parent_id, file_size, created_at, updated_at) VALUES (?1, ?2, '{}', ?3, ?4, 'file', ?5, ?6, ?7, ?8)",
rusqlite::params![nid, fname_clone, fuuid_clone, sha_clone, other_id, file_size, now, now],
)?;
conn.execute(
"INSERT OR IGNORE INTO file_locations (file_uuid, location, label) VALUES (?1, ?2, 'origin')",
rusqlite::params![fuuid_clone, fpath_clone],
)?;
Ok(())
})
.await;
match db_result {
Ok(Ok(())) => {}
Ok(Err(e)) => {