From 0e73d2a2ce60417871c519ba15a855c1ded732c4 Mon Sep 17 00:00:00 2001
From: Accusys
Date: Fri, 15 May 2026 14:58:44 +0800
Subject: [PATCH] test: add unified probe unit tests (8 Rust + 6 Python), fix pre-existing test compilation errors

---
Review notes (this section is ignored by `git am`):
- scripts/test_probe_file.py: portable shebang; fixtures are now created in a
  per-test temporary directory instead of fixed /tmp paths and are always
  removed; test_pdf_has_streams renamed to test_pdf_has_no_streams to match
  its assertion.
- src/core/probe/unified.rs: test_base_format_info uses a per-process file
  name and deletes the file before asserting.
- Line structure and indentation were restored by hand; the post-image blob
  ids on the `index` lines of the two revised files were not regenerated.

 scripts/test_probe_file.py  | 117 ++++++++++++++++++++++++++++++++++++
 src/api/universal_search.rs |   2 +
 src/core/db/postgres_db.rs  |   9 ++-
 src/core/probe/unified.rs   |  72 ++++++++++++++++++++++
 4 files changed, 197 insertions(+), 3 deletions(-)
 create mode 100644 scripts/test_probe_file.py

diff --git a/scripts/test_probe_file.py b/scripts/test_probe_file.py
new file mode 100644
index 0000000..25088c4
--- /dev/null
+++ b/scripts/test_probe_file.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+"""Unit tests for probe_file.py.
+
+Every test builds its fixture inside a private temporary directory that is
+removed when the test finishes, so runs never share or leak files.
+"""
+
+import json
+import os
+import sys
+import tempfile
+import unittest
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+import probe_file  # noqa: E402
+
+
+class ProbeTestCase(unittest.TestCase):
+    """Shared plumbing: per-test temp directory and a JSON-decoding probe call."""
+
+    def setUp(self):
+        tmp = tempfile.TemporaryDirectory(prefix="probe_test_")
+        self.addCleanup(tmp.cleanup)
+        self.tmpdir = tmp.name
+
+    def fixture(self, name):
+        """Return the path of a fixture file called `name` in the temp directory."""
+        return os.path.join(self.tmpdir, name)
+
+    def probe(self, path):
+        """Run probe_file.probe on `path` and decode the JSON text it returns."""
+        return json.loads(probe_file.probe(path))
+
+
+class TestProbePDF(ProbeTestCase):
+    def setUp(self):
+        super().setUp()
+        from PyPDF2 import PdfWriter
+
+        self.path = self.fixture("_test_probe.pdf")
+        writer = PdfWriter()
+        writer.add_blank_page(210, 297)
+        with open(self.path, "wb") as f:
+            writer.write(f)
+
+    def test_pdf_has_pages(self):
+        result = self.probe(self.path)
+        self.assertIn("document", result)
+        self.assertGreater(result["document"]["pages"], 0)
+
+    def test_pdf_has_no_streams(self):
+        result = self.probe(self.path)
+        self.assertEqual(result["streams"], [])
+
+
+class TestProbeDOCX(ProbeTestCase):
+    def setUp(self):
+        super().setUp()
+        from docx import Document
+
+        self.path = self.fixture("_test_probe.docx")
+        doc = Document()
+        doc.add_paragraph("Test paragraph.")
+        doc.save(self.path)
+
+    def test_docx_has_paragraphs(self):
+        result = self.probe(self.path)
+        self.assertIn("document", result)
+        self.assertGreater(result["document"]["paragraphs"], 0)
+
+
+class TestProbeXLSX(ProbeTestCase):
+    def setUp(self):
+        super().setUp()
+        import openpyxl
+
+        self.path = self.fixture("_test_probe.xlsx")
+        wb = openpyxl.Workbook()
+        ws = wb.active
+        ws.title = "Sheet1"
+        ws.append(["A", 1])
+        wb.save(self.path)
+
+    def test_xlsx_has_sheets(self):
+        result = self.probe(self.path)
+        self.assertIn("spreadsheet", result)
+        self.assertGreater(result["spreadsheet"]["sheet_count"], 0)
+
+
+class TestProbePPTX(ProbeTestCase):
+    def setUp(self):
+        super().setUp()
+        from pptx import Presentation
+
+        self.path = self.fixture("_test_probe.pptx")
+        prs = Presentation()
+        slide = prs.slides.add_slide(prs.slide_layouts[0])
+        slide.shapes.title.text = "Test"
+        prs.save(self.path)
+
+    def test_pptx_has_slides(self):
+        result = self.probe(self.path)
+        self.assertIn("presentation", result)
+        self.assertGreater(result["presentation"]["slide_count"], 0)
+
+
+class TestProbeUnknown(ProbeTestCase):
+    def test_unknown_extension(self):
+        path = self.fixture("_test_probe.xyz")
+        with open(path, "w") as f:
+            f.write("test")
+        result = self.probe(path)
+        self.assertEqual(result["streams"], [])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/src/api/universal_search.rs b/src/api/universal_search.rs
index acdeb90..532460a 100644
--- a/src/api/universal_search.rs
+++ b/src/api/universal_search.rs
@@ -821,6 +821,8 @@ mod tests {
             ocr_text: None,
             has_face: None,
             speaker_id: None,
+            chunk_type: None,
+            co_appears_with_trace_id: None,
             min_confidence: Some(0.8),
             min_unique_classes: Some(3),
             min_spatial_density: Some(0.5),
diff --git a/src/core/db/postgres_db.rs b/src/core/db/postgres_db.rs
index cda0c32..6115ac4 100644
--- a/src/core/db/postgres_db.rs
+++ b/src/core/db/postgres_db.rs
@@ -4912,16 +4912,19 @@ mod tests {
             started_at: Some(
                 chrono::DateTime::parse_from_rfc3339("2024-01-01T10:00:00Z")
                     .unwrap()
-                    .with_timezone(&chrono::Utc),
+                    .with_timezone(&chrono::Utc)
+                    .to_rfc3339(),
             ),
             updated_at: Some(
                 chrono::DateTime::parse_from_rfc3339("2024-01-01T10:05:00Z")
                     .unwrap()
-                    .with_timezone(&chrono::Utc),
+                    .with_timezone(&chrono::Utc)
+                    .to_rfc3339(),
             ),
             created_at: chrono::DateTime::parse_from_rfc3339("2024-01-01T09:55:00Z")
                 .unwrap()
-                .into(),
+                .with_timezone(&chrono::Utc)
+                .to_rfc3339(),
             processors: vec!["asr".to_string(), "cut".to_string()],
             completed_processors: vec!["asr".to_string()],
             failed_processors: vec![],
diff --git a/src/core/probe/unified.rs b/src/core/probe/unified.rs
index 4316da4..3d9804d 100644
--- a/src/core/probe/unified.rs
+++ b/src/core/probe/unified.rs
@@ -1,6 +1,78 @@
 use std::path::Path;
 use std::time::SystemTime;
 
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::fs;
+
+    #[test]
+    fn test_detect_category_video() {
+        assert_eq!(detect_category(Path::new("video.mp4")), FileCategory::Video);
+        assert_eq!(detect_category(Path::new("video.mov")), FileCategory::Video);
+        assert_eq!(detect_category(Path::new("video.mkv")), FileCategory::Video);
+        assert_eq!(detect_category(Path::new("video.avi")), FileCategory::Video);
+    }
+
+    #[test]
+    fn test_detect_category_image() {
+        assert_eq!(detect_category(Path::new("photo.jpg")), FileCategory::Image);
+        assert_eq!(detect_category(Path::new("photo.jpeg")), FileCategory::Image);
+        assert_eq!(detect_category(Path::new("photo.png")), FileCategory::Image);
+        assert_eq!(detect_category(Path::new("photo.svg")), FileCategory::Image);
+        assert_eq!(detect_category(Path::new("photo.webp")), FileCategory::Image);
+    }
+
+    #[test]
+    fn test_detect_category_document() {
+        assert_eq!(detect_category(Path::new("doc.pdf")), FileCategory::Document);
+        assert_eq!(detect_category(Path::new("doc.docx")), FileCategory::Document);
+        assert_eq!(detect_category(Path::new("doc.pages")), FileCategory::Document);
+        assert_eq!(detect_category(Path::new("doc.txt")), FileCategory::Document);
+    }
+
+    #[test]
+    fn test_detect_category_spreadsheet() {
+        assert_eq!(detect_category(Path::new("data.xlsx")), FileCategory::Spreadsheet);
+        assert_eq!(detect_category(Path::new("data.csv")), FileCategory::Spreadsheet);
+        assert_eq!(detect_category(Path::new("data.numbers")), FileCategory::Spreadsheet);
+    }
+
+    #[test]
+    fn test_detect_category_presentation() {
+        assert_eq!(detect_category(Path::new("deck.pptx")), FileCategory::Presentation);
+        assert_eq!(detect_category(Path::new("deck.key")), FileCategory::Presentation);
+    }
+
+    #[test]
+    fn test_detect_category_archive() {
+        assert_eq!(detect_category(Path::new("files.zip")), FileCategory::Archive);
+        assert_eq!(detect_category(Path::new("files.tar.gz")), FileCategory::Archive);
+    }
+
+    #[test]
+    fn test_detect_category_unknown() {
+        assert_eq!(detect_category(Path::new("file.xyz")), FileCategory::Unknown);
+        assert_eq!(detect_category(Path::new("file")), FileCategory::Unknown);
+    }
+
+    #[test]
+    fn test_base_format_info() {
+        // Per-process file name so concurrent test runs sharing the temp dir
+        // cannot clobber each other's fixture.
+        let name = format!("_test_unified_probe_{}.txt", std::process::id());
+        let tmp = std::env::temp_dir().join(name);
+        fs::write(&tmp, b"hello probe").unwrap();
+        let info = base_format_info(&tmp);
+        // Remove the fixture before asserting so a failed assertion cannot leak it.
+        let _ = fs::remove_file(&tmp);
+        assert_eq!(info["file_type"], "document");
+        assert_eq!(info["format_name"], "txt");
+        assert!(!info["size"].as_str().unwrap_or("").is_empty());
+        assert!(!info["mtime"].as_str().unwrap_or("").is_empty());
+    }
+}
+
 /// File category derived from extension
 #[derive(Debug, Clone, PartialEq)]
 pub enum FileCategory {