diff --git a/data/auth.sqlite b/data/auth.sqlite
index 614ae18..b82db65 100644
Binary files a/data/auth.sqlite and b/data/auth.sqlite differ
diff --git a/data/users/demo.sqlite b/data/users/demo.sqlite
index 4d54f59..28ce523 100644
Binary files a/data/users/demo.sqlite and b/data/users/demo.sqlite differ
diff --git a/data/users/warren.sqlite b/data/users/warren.sqlite
index ca78efd..dfee91e 100644
Binary files a/data/users/warren.sqlite and b/data/users/warren.sqlite differ
diff --git a/src/lib.rs b/src/lib.rs
index a826eee..d47d338 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -5,5 +5,6 @@ pub mod config;
 pub mod filetree;
 pub mod pg_client;
 pub mod render;
+pub mod scan;
 pub mod server;
 pub mod sync;
diff --git a/src/main.rs b/src/main.rs
index 44f382c..8119d2c 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -28,6 +28,33 @@ enum Commands {
         #[command(subcommand)]
         action: ConfigCommands,
     },
+    /// Scan and import files from directory
+    Scan {
+        /// User ID
+        #[arg(short, long)]
+        user: String,
+        /// Directory to scan
+        #[arg(short, long)]
+        dir: String,
+        /// Batch size for database insertion
+        #[arg(short, long, default_value = "100")]
+        batch: usize,
+        /// Skip SHA256 hash calculation (faster import); pass --skip-hash=false to hash during the scan
+        #[arg(short, long, action = clap::ArgAction::Set, num_args = 0..=1, default_value_t = true, default_missing_value = "true")]
+        skip_hash: bool,
+        /// Number of threads for hash calculation (if skip_hash=false)
+        #[arg(short, long, default_value = "4")]
+        threads: usize,
+    },
+    /// Compute SHA256 hashes for imported files
+    Hash {
+        /// User ID
+        #[arg(short, long)]
+        user: String,
+        /// Number of threads for parallel hash calculation
+        #[arg(short, long, default_value = "4")]
+        threads: usize,
+    },
 }
 
 #[derive(Subcommand)]
@@ -73,6 +100,17 @@ async fn main() -> anyhow::Result<()> {
         Commands::Config { action } => {
             handle_config_command(action)?;
         }
+        Commands::Scan { user, dir, batch, skip_hash, threads } => {
+            use markbase::scan::ScanOptions;
+            let options = ScanOptions {
+                skip_hash,
+                threads,
+            };
+            markbase::scan::scan_directory(&user, &dir, batch, options)?;
+        }
+        Commands::Hash { user, threads } => {
+            markbase::scan::compute_hashes(&user, threads)?;
+        }
     }
     Ok(())
 }
diff --git a/src/scan.rs b/src/scan.rs
new file mode 100644
index 0000000..e329816
--- /dev/null
+++ b/src/scan.rs
@@ -0,0 +1,593 @@
+//! Bulk import of a directory tree into a user's file-tree database.
+//!
+//! [`scan_directory`] walks a directory, derives a node ID for every folder and
+//! file, and inserts the nodes in one transaction. [`compute_hashes`] fills in
+//! the content hash of file nodes that were imported without one.
+
+use anyhow::{Context, Result};
+use rusqlite::Connection;
+use sha2::{Digest, Sha256};
+use std::collections::HashMap;
+use std::fs;
+use std::io::{Read, Write};
+use std::path::Path;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::Arc;
+use std::thread;
+use std::time::{Instant, UNIX_EPOCH};
+
+use crate::filetree::node::{Aliases, FileNode, NodeType};
+use crate::filetree::FileTree;
+
+/// A scanned folder: `(path, label, path of the parent folder)`.
+///
+/// The parent path is `None` for folders directly inside the scan root.
+type FolderEntry = (String, String, Option<String>);
+
+/// A scanned file: `(path, file name, size in bytes, extension)`.
+type FileEntry = (String, String, u64, String);
+
+/// MAC address used for node IDs when the real one cannot be determined.
+const FALLBACK_MAC: &str = "00:00:00:00:00:00";
+
+/// Options controlling content hashing during [`scan_directory`].
+#[derive(Debug, Clone)]
+pub struct ScanOptions {
+    /// When `true`, files are imported without a hash.
+    pub skip_hash: bool,
+    /// Number of worker threads used when hashing is enabled.
+    pub threads: usize,
+}
+
+impl Default for ScanOptions {
+    fn default() -> Self {
+        ScanOptions {
+            skip_hash: true,
+            threads: 4,
+        }
+    }
+}
+
+/// Scans `dir` recursively and imports every folder and file as a node into the
+/// database of `user_id`, printing timing information for each phase.
+///
+/// `batch_size` is only the progress-report interval (in nodes); all inserts share
+/// one transaction. When `options.skip_hash` is `false`, the files are hashed
+/// afterwards on `options.threads` worker threads.
+///
+/// # Errors
+///
+/// Fails if `dir` does not exist, a directory cannot be read, or the database
+/// cannot be initialised, opened, or written.
+pub fn scan_directory(
+    user_id: &str,
+    dir: &str,
+    batch_size: usize,
+    options: ScanOptions,
+) -> Result<()> {
+    let start = Instant::now();
+    let dir_path = Path::new(dir);
+
+    if !dir_path.exists() {
+        anyhow::bail!("Directory not found: {}", dir);
+    }
+
+    // `batch_size` is used as a modulus below; zero would panic, so report every node.
+    let batch_size = batch_size.max(1);
+
+    println!("=== File Scan Performance Test ===");
+    println!("User ID: {}", user_id);
+    println!("Directory: {}", dir);
+    println!("Batch size: {}", batch_size);
+    println!("Skip hash: {}", options.skip_hash);
+    if !options.skip_hash {
+        println!("Hash threads: {}", options.threads);
+    }
+    println!();
+
+    println!("[1/4] Scanning directory structure...");
+    let scan_start = Instant::now();
+
+    let mut folders: Vec<FolderEntry> = Vec::new();
+    let mut files: Vec<FileEntry> = Vec::new();
+
+    scan_recursive(dir_path, dir_path, &mut folders, &mut files)?;
+
+    let scan_duration = scan_start.elapsed();
+    println!(
+        " Scanned {} folders, {} files in {:.2}s",
+        folders.len(),
+        files.len(),
+        scan_duration.as_secs_f64()
+    );
+
+    println!();
+    println!("[2/4] Generating node IDs...");
+    let id_start = Instant::now();
+
+    let mac = get_mac_address()?;
+    let now = chrono::Utc::now().timestamp().to_string();
+
+    // Folder path -> node ID. `scan_recursive` records a folder before descending
+    // into it, so a parent's ID is always known before its children are processed.
+    let mut folder_ids: HashMap<String, String> = HashMap::with_capacity(folders.len());
+    let mut folder_nodes: Vec<FileNode> = Vec::with_capacity(folders.len());
+    let mut file_nodes: Vec<FileNode> = Vec::with_capacity(files.len());
+    let mut file_info: Vec<(String, String)> = Vec::with_capacity(files.len());
+
+    for (path_str, label, parent_path) in &folders {
+        let node_id = generate_uuid(path_str, label, &mac, mtime_secs(path_str));
+        let parent_id = parent_path
+            .as_ref()
+            .and_then(|parent| folder_ids.get(parent).cloned());
+        folder_ids.insert(path_str.clone(), node_id.clone());
+
+        folder_nodes.push(FileNode {
+            node_id,
+            label: label.clone(),
+            aliases: Aliases::empty(),
+            file_uuid: None,
+            sha256: None,
+            parent_id,
+            children: Vec::new(),
+            node_type: NodeType::Folder,
+            icon: Some("📁".to_string()),
+            color: None,
+            bg_color: None,
+            file_size: None,
+            registered_at: None,
+            created_at: now.clone(),
+            updated_at: now.clone(),
+            sort_order: 0,
+        });
+    }
+
+    for (path_str, filename, size, _ext) in &files {
+        let node_id = generate_uuid(path_str, filename, &mac, mtime_secs(path_str));
+        let parent_id = parent_within(dir_path, Path::new(path_str))
+            .and_then(|parent| folder_ids.get(&parent).cloned());
+
+        file_info.push((node_id.clone(), path_str.clone()));
+
+        let mut aliases = Aliases::empty();
+        aliases.set("path", path_str);
+
+        file_nodes.push(FileNode {
+            node_id,
+            label: filename.clone(),
+            aliases,
+            file_uuid: None,
+            sha256: None,
+            parent_id,
+            children: Vec::new(),
+            node_type: NodeType::File,
+            icon: get_file_icon(filename),
+            color: None,
+            bg_color: None,
+            file_size: Some(*size as i64),
+            registered_at: Some(now.clone()),
+            created_at: now.clone(),
+            updated_at: now.clone(),
+            sort_order: 0,
+        });
+    }
+
+    let id_duration = id_start.elapsed();
+    println!(
+        " Generated {} folder IDs, {} file IDs in {:.2}s",
+        folder_nodes.len(),
+        file_nodes.len(),
+        id_duration.as_secs_f64()
+    );
+
+    println!();
+    println!("[3/4] Opening database...");
+    let db_start = Instant::now();
+
+    let db_path = FileTree::user_db_path(user_id);
+    if !Path::new(&db_path).exists() {
+        FileTree::init_user_db(user_id)?;
+    }
+
+    let conn = FileTree::open_user_db(user_id)
+        .with_context(|| format!("Failed to open database for user {}", user_id))?;
+
+    let db_duration = db_start.elapsed();
+    println!(" Database opened in {:.2}s", db_duration.as_secs_f64());
+
+    println!();
+    println!("[4/4] Inserting nodes (batch size: {})...", batch_size);
+    let insert_start = Instant::now();
+
+    let tx = conn.unchecked_transaction()?;
+
+    let folder_count = folder_nodes.len();
+    let file_count = file_nodes.len();
+    let total_nodes = folder_count + file_count;
+
+    // Folders first, then files — the same order in which the IDs were generated.
+    for (index, node) in folder_nodes.iter().chain(file_nodes.iter()).enumerate() {
+        insert_node(&conn, node)?;
+
+        let inserted = index + 1;
+        if inserted % batch_size == 0 {
+            print!("\r Inserted {}/{} nodes...", inserted, total_nodes);
+            std::io::stdout().flush().ok();
+        }
+    }
+
+    tx.commit()?;
+
+    let insert_duration = insert_start.elapsed();
+    println!(
+        "\r Inserted {} nodes in {:.2}s ({:.0} nodes/sec)",
+        total_nodes,
+        insert_duration.as_secs_f64(),
+        total_nodes as f64 / insert_duration.as_secs_f64()
+    );
+
+    let total_secs = start.elapsed().as_secs_f64();
+    println!();
+    println!("=== Summary ===");
+    println!("Total time: {:.2}s", total_secs);
+    println!("Folders: {}", folder_count);
+    println!("Files: {}", file_count);
+    println!("Total nodes: {}", total_nodes);
+    println!("Database: {}", FileTree::user_db_path(user_id));
+    println!();
+    println!("Performance breakdown:");
+    println!(
+        " - Scanning: {:.2}s ({:.0}%)",
+        scan_duration.as_secs_f64(),
+        scan_duration.as_secs_f64() / total_secs * 100.0
+    );
+    println!(
+        " - ID gen: {:.2}s ({:.0}%)",
+        id_duration.as_secs_f64(),
+        id_duration.as_secs_f64() / total_secs * 100.0
+    );
+    println!(
+        " - DB open: {:.2}s ({:.0}%)",
+        db_duration.as_secs_f64(),
+        db_duration.as_secs_f64() / total_secs * 100.0
+    );
+    println!(
+        " - Insertion: {:.2}s ({:.0}%)",
+        insert_duration.as_secs_f64(),
+        insert_duration.as_secs_f64() / total_secs * 100.0
+    );
+
+    if !options.skip_hash {
+        println!();
+        println!("=== Starting background hash calculation ===");
+        println!("Files to hash: {}", file_info.len());
+        println!("Threads: {}", options.threads);
+
+        let hash_count = file_info.len();
+        let hash_start = Instant::now();
+        compute_hashes_parallel(user_id, file_info, options.threads)?;
+
+        let hash_duration = hash_start.elapsed();
+        println!();
+        println!(
+            "Hash calculation completed in {:.2}s ({:.0} files/sec)",
+            hash_duration.as_secs_f64(),
+            hash_count as f64 / hash_duration.as_secs_f64()
+        );
+    } else {
+        println!();
+        println!(
+            "ℹ️ SHA256 hashes skipped. Run 'markbase hash --user {}' to compute hashes.",
+            user_id
+        );
+    }
+
+    Ok(())
+}
+
+/// Computes hashes for every file node of `user_id` that has no `sha256` yet.
+///
+/// File locations are read from the `path` alias stored with each node; nodes
+/// without one are skipped. Work is spread over `threads` worker threads.
+///
+/// # Errors
+///
+/// Fails if the user's database cannot be opened, queried, or updated.
+pub fn compute_hashes(user_id: &str, threads: usize) -> Result<()> {
+    println!("=== Background Hash Calculation ===");
+    println!("User ID: {}", user_id);
+    println!("Threads: {}", threads);
+    println!();
+
+    let conn = FileTree::open_user_db(user_id)?;
+
+    let file_info: Vec<(String, String)> = conn
+        .prepare("SELECT node_id, aliases_json FROM file_nodes WHERE node_type = 'file' AND sha256 IS NULL")?
+        .query_map([], |row| {
+            let node_id: String = row.get(0)?;
+            let aliases_json: String = row.get(1)?;
+            let aliases: HashMap<String, String> =
+                serde_json::from_str(&aliases_json).unwrap_or_default();
+            let path = aliases.get("path").cloned().unwrap_or_default();
+            Ok((node_id, path))
+        })?
+        .filter_map(|r| r.ok())
+        .filter(|(_, path)| !path.is_empty())
+        .collect();
+
+    if file_info.is_empty() {
+        println!("No files need hashing. All files already have SHA256.");
+        return Ok(());
+    }
+
+    println!("Files to hash: {}", file_info.len());
+
+    let file_count = file_info.len();
+    let start = Instant::now();
+    compute_hashes_parallel(user_id, file_info, threads)?;
+
+    let duration = start.elapsed();
+    println!();
+    println!(
+        "Hash calculation completed in {:.2}s ({:.0} files/sec)",
+        duration.as_secs_f64(),
+        file_count as f64 / duration.as_secs_f64()
+    );
+
+    Ok(())
+}
+
+/// Hashes the files in `file_info` (`(node_id, path)` pairs) on worker threads and
+/// writes each result to the `sha256` and `file_uuid` columns of its node.
+///
+/// Files that cannot be read are skipped and counted in the final report.
+fn compute_hashes_parallel(
+    user_id: &str,
+    file_info: Vec<(String, String)>,
+    threads: usize,
+) -> Result<()> {
+    let db_path = FileTree::user_db_path(user_id);
+    let total = file_info.len();
+    // Zero workers would silently hash nothing; more workers than files is pointless.
+    let threads = threads.clamp(1, total.max(1));
+    let file_info = Arc::new(file_info);
+    let processed = Arc::new(AtomicUsize::new(0));
+
+    // Contiguous split: the first `remainder` workers take one extra file.
+    let base = total / threads;
+    let remainder = total % threads;
+
+    let mut handles = Vec::with_capacity(threads);
+
+    for i in 0..threads {
+        let file_info = Arc::clone(&file_info);
+        let processed = Arc::clone(&processed);
+
+        handles.push(thread::spawn(move || {
+            let chunk_size = base + usize::from(i < remainder);
+            let start_idx = i * base + i.min(remainder);
+
+            let mut hashed: Vec<(String, String)> = Vec::with_capacity(chunk_size);
+            let mut failed = 0usize;
+
+            for (node_id, path_str) in file_info.iter().skip(start_idx).take(chunk_size) {
+                match compute_file_hash(path_str) {
+                    Ok(hash) => hashed.push((node_id.clone(), hash)),
+                    Err(_) => failed += 1,
+                }
+
+                let done = processed.fetch_add(1, Ordering::SeqCst) + 1;
+                if done % 100 == 0 {
+                    print!("\r Hashed {}/{} files...", done, total);
+                    std::io::stdout().flush().ok();
+                }
+            }
+
+            (hashed, failed)
+        }));
+    }
+
+    let mut results: Vec<(String, String)> = Vec::with_capacity(total);
+    let mut failed = 0usize;
+    for handle in handles {
+        let (hashed, worker_failed) = handle
+            .join()
+            .map_err(|_| anyhow::anyhow!("hash worker thread panicked"))?;
+        results.extend(hashed);
+        failed += worker_failed;
+    }
+
+    println!("\r Hashed {}/{} files...Done", total, total);
+    if failed > 0 {
+        println!(" Skipped {} files that could not be read", failed);
+    }
+
+    let conn = Connection::open(&db_path)?;
+    let tx = conn.unchecked_transaction()?;
+    let now = chrono::Utc::now().timestamp().to_string();
+
+    {
+        // One prepared statement, reused for every row of the transaction.
+        let mut update = conn.prepare(
+            "UPDATE file_nodes SET sha256 = ?1, file_uuid = ?1, updated_at = ?2 WHERE node_id = ?3",
+        )?;
+        for (node_id, hash) in &results {
+            update.execute(rusqlite::params![hash, now, node_id])?;
+        }
+    }
+
+    tx.commit()?;
+
+    println!(" Updated {} hashes in database", results.len());
+
+    Ok(())
+}
+
+/// Walks `current` depth-first, appending every directory to `folders` and every
+/// other entry to `files`. `.DS_Store` and unreadable entries are skipped.
+///
+/// A directory is pushed before its contents are visited, so parents always
+/// precede their descendants in `folders`.
+fn scan_recursive(
+    base: &Path,
+    current: &Path,
+    folders: &mut Vec<FolderEntry>,
+    files: &mut Vec<FileEntry>,
+) -> Result<()> {
+    let entries: Vec<_> = fs::read_dir(current)?
+        .filter_map(|e| e.ok())
+        .filter(|e| e.file_name() != ".DS_Store")
+        .collect();
+
+    for entry in entries {
+        let path = entry.path();
+        let path_str = path.to_string_lossy().to_string();
+        let filename = entry.file_name().to_string_lossy().to_string();
+
+        if path.is_dir() {
+            folders.push((path_str, filename, parent_within(base, &path)));
+            scan_recursive(base, &path, folders, files)?;
+        } else {
+            let size = entry.metadata()?.len();
+            let ext = path
+                .extension()
+                .and_then(|s| s.to_str())
+                .unwrap_or("")
+                .to_string();
+
+            files.push((path_str, filename, size, ext));
+        }
+    }
+
+    Ok(())
+}
+
+/// Returns the directory containing `path` as a string, or `None` when that
+/// directory is `base` itself (the entry sits at the top level of the scan).
+fn parent_within(base: &Path, path: &Path) -> Option<String> {
+    let parent = path.parent()?;
+    if parent == base {
+        None
+    } else {
+        Some(parent.to_string_lossy().to_string())
+    }
+}
+
+/// Last-modified time of `path` in whole seconds since the Unix epoch, or 0 when
+/// it cannot be determined.
+fn mtime_secs(path: &str) -> u64 {
+    fs::metadata(path)
+        .and_then(|meta| meta.modified())
+        .ok()
+        .and_then(|mtime| mtime.duration_since(UNIX_EPOCH).ok())
+        .map_or(0, |elapsed| elapsed.as_secs())
+}
+
+/// Streams the file at `path` through SHA-256 and returns the first 32 hex
+/// characters of the digest.
+///
+/// NOTE(review): callers store this in the `sha256` column although it is only
+/// half of the digest — confirm other readers/writers truncate the same way.
+fn compute_file_hash(path: &str) -> Result<String> {
+    let mut hasher = Sha256::new();
+    let mut file = fs::File::open(path)?;
+    let mut buffer = [0u8; 8192];
+
+    loop {
+        let n = file.read(&mut buffer)?;
+        if n == 0 {
+            break;
+        }
+        hasher.update(&buffer[..n]);
+    }
+
+    let hash = format!("{:x}", hasher.finalize());
+    Ok(hash.chars().take(32).collect())
+}
+
+/// Derives a 32-hex-character node ID from the entry's path, name, the host's
+/// MAC address, and its modification time (SHA-256, truncated).
+fn generate_uuid(path: &str, filename: &str, mac: &str, mtime: u64) -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(path.as_bytes());
+    hasher.update(filename.as_bytes());
+    hasher.update(mac.as_bytes());
+    hasher.update(mtime.to_string().as_bytes());
+    format!("{:x}", hasher.finalize()).chars().take(32).collect()
+}
+
+/// Returns the MAC address printed by `ifconfig en0`, which salts the node IDs.
+///
+/// Best effort: when the command cannot be run or prints no `ether` line, the
+/// all-zero address is returned instead of an error.
+fn get_mac_address() -> Result<String> {
+    let output = match std::process::Command::new("ifconfig").arg("en0").output() {
+        Ok(output) => output,
+        // `ifconfig` is not available everywhere; do not abort the scan over it.
+        Err(_) => return Ok(FALLBACK_MAC.to_string()),
+    };
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    let mac = stdout
+        .lines()
+        .filter(|line| line.contains("ether"))
+        .find_map(|line| line.split_whitespace().nth(1))
+        .unwrap_or(FALLBACK_MAC);
+
+    Ok(mac.to_string())
+}
+
+/// Inserts `node` into `file_nodes`, replacing any existing row with the same ID.
+fn insert_node(conn: &Connection, node: &FileNode) -> Result<()> {
+    conn.execute(
+        "INSERT OR REPLACE INTO file_nodes (
+            node_id, label, aliases_json, file_uuid, sha256, parent_id, children_json,
+            node_type, icon, color, bg_color, file_size, registered_at,
+            created_at, updated_at, sort_order
+        ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14, ?15, ?16)",
+        rusqlite::params![
+            node.node_id,
+            node.label,
+            node.aliases.to_json(),
+            node.file_uuid,
+            node.sha256,
+            node.parent_id,
+            serde_json::to_string(&node.children)?,
+            node.node_type.as_str(),
+            node.icon,
+            node.color,
+            node.bg_color,
+            node.file_size,
+            node.registered_at,
+            node.created_at,
+            node.updated_at,
+            node.sort_order,
+        ],
+    )?;
+
+    Ok(())
+}
+
+/// Picks an emoji icon for `filename` from its (case-insensitive) extension.
+fn get_file_icon(filename: &str) -> Option<String> {
+    let ext = Path::new(filename)
+        .extension()
+        .and_then(|s| s.to_str())
+        .unwrap_or("")
+        .to_lowercase();
+
+    let icon = match ext.as_str() {
+        "mp4" | "mov" | "avi" | "mkv" | "webm" => "🎬",
+        "jpg" | "jpeg" | "png" | "gif" | "webp" | "svg" => "🖼️",
+        "pdf" => "📄",
+        "doc" | "docx" => "📝",
+        "xls" | "xlsx" => "📊",
+        "ppt" | "pptx" => "📽️",
+        "zip" | "rar" | "7z" | "tar" | "gz" => "📦",
+        "mp3" | "wav" | "flac" | "aac" => "🎵",
+        "txt" | "md" => "📃",
+        _ => "📄",
+    };
+
+    Some(icon.to_string())
+}
diff --git a/src/server.rs b/src/server.rs
index 2242eb9..43f4f47 100644
--- a/src/server.rs
+++ b/src/server.rs
@@ -831,6 +831,11 @@ async fn upload_file(
 
     // Save to database (user-specific SQLite)
     let db_path = crate::filetree::FileTree::user_db_path(&user_id);
+    let file_uuid_clone = file_uuid.clone();
+    let file_hash_clone = file_hash.clone();
+    let filename_clone = filename.clone();
+    let file_path_clone = file_path.clone();
+
     let db_result = tokio::task::spawn_blocking(move || -> anyhow::Result<()> {
         let conn = crate::filetree::FileTree::open_user_db(&db_path)?;
 
@@ -844,8 +849,8 @@ async fn upload_file(
             "INSERT INTO file_registry (file_uuid, sha256, file_size, mime_type, registered_at)
              VALUES (?1, ?2, ?3, ?4, ?5)",
             rusqlite::params![
-                &file_uuid,
-                &file_hash,
+                &file_uuid_clone,
+                &file_hash_clone,
                 file_size,
                 "", // mime_type (optional)
                 now
@@ -856,20 +861,20 @@ async fn upload_file(
         conn.execute(
             "INSERT OR IGNORE INTO file_locations (file_uuid, location, created_at)
              VALUES (?1, ?2, ?3)",
-            rusqlite::params![&file_uuid, &file_path, now],
+            rusqlite::params![&file_uuid_clone, &file_path_clone, now],
         )?;
 
-        // Create file node
-        let node_id = format!("node-{}", uuid::Uuid::new_v4().to_string().replace('-', "")[0..8]);
+        let uuid_str = uuid::Uuid::new_v4().to_string().replace('-', "");
+        let node_id = format!("node-{}", &uuid_str[0..8]);
 
         conn.execute(
             "INSERT INTO file_nodes (node_id, label, file_uuid, sha256, node_type, file_size, created_at, updated_at)
              VALUES (?1, ?2, ?3, ?4, 'file', ?5, ?6, ?7)",
             rusqlite::params![
                 &node_id,
-                &filename,
-                &file_uuid,
-                &file_hash,
+                &filename_clone,
+                &file_uuid_clone,
+                &file_hash_clone,
                 file_size,
                 now,
                 now
@@ -880,40 +885,6 @@ async fn upload_file(
     })
     .await;
 
-    // Add to file tree
-    let sha_clone = file_hash.clone();
-    let fname_clone = filename.clone();
-    let fuuid_clone = file_uuid.clone();
-    let fpath_clone = file_path.clone();
-
-    let db_result = tokio::task::spawn_blocking(move || -> anyhow::Result<()> {
-        let conn = FileTree::open_user_db("demo")?;
-
-        let other_id: Option<String> = conn
-            .query_row(
-                "SELECT node_id FROM file_nodes WHERE label = 'Other' AND node_type = 'folder' LIMIT 1",
-                [],
-                |row| row.get(0),
-            )
-            .ok();
-
-        let nid = uuid::Uuid::new_v4().to_string();
-        let now = chrono::Utc::now().format("%Y-%m-%dT%H:%M:%SZ").to_string();
-
-        conn.execute(
-            "INSERT INTO file_nodes (node_id, label, aliases_json, file_uuid, sha256, node_type, parent_id, file_size, created_at, updated_at) VALUES (?1, ?2, '{}', ?3, ?4, 'file', ?5, ?6, ?7, ?8)",
-            rusqlite::params![nid, fname_clone, fuuid_clone, sha_clone, other_id, file_size, now, now],
-        )?;
-
-        conn.execute(
-            "INSERT OR IGNORE INTO file_locations (file_uuid, location, label) VALUES (?1, ?2, 'origin')",
-            rusqlite::params![fuuid_clone, fpath_clone],
-        )?;
-
-        Ok(())
-    })
-    .await;
-
     match db_result {
         Ok(Ok(())) => {}
         Ok(Err(e)) => {