test: add unified probe unit tests (8 Rust + 6 Python), fix pre-existing test compilation errors
This commit is contained in:
92
scripts/test_probe_file.py
Normal file
92
scripts/test_probe_file.py
Normal file
@@ -0,0 +1,92 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""Unit tests for probe_file.py"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import os
|
||||
import unittest
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
import probe_file # noqa: E402
|
||||
|
||||
|
||||
class TestProbePDF(unittest.TestCase):
    """Probe a freshly generated one-page PDF and check the reported fields."""

    def setUp(self):
        """Write a blank single-page PDF into a per-test temporary directory."""
        import tempfile

        from PyPDF2 import PdfWriter

        # A private directory avoids silently reusing a stale or foreign file
        # at a predictable /tmp path, and is removed when the test finishes.
        tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(tmp_dir.cleanup)
        self.path = os.path.join(tmp_dir.name, "_test_probe.pdf")

        writer = PdfWriter()
        writer.add_blank_page(210, 297)
        with open(self.path, "wb") as f:
            writer.write(f)

    def test_pdf_has_pages(self):
        """The probe result has a "document" section with a positive page count."""
        result = json.loads(probe_file.probe(self.path))
        self.assertIn("document", result)
        self.assertGreater(result["document"]["pages"], 0)

    def test_pdf_has_streams(self):
        """The probe result for a PDF carries an empty "streams" list."""
        result = json.loads(probe_file.probe(self.path))
        self.assertEqual(result["streams"], [])
|
||||
|
||||
|
||||
class TestProbeDOCX(unittest.TestCase):
    """Probe a freshly generated DOCX file and check the reported fields."""

    def setUp(self):
        """Write a one-paragraph DOCX into a per-test temporary directory."""
        import tempfile

        from docx import Document

        # A private directory avoids silently reusing a stale or foreign file
        # at a predictable /tmp path, and is removed when the test finishes.
        tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(tmp_dir.cleanup)
        self.path = os.path.join(tmp_dir.name, "_test_probe.docx")

        document = Document()
        document.add_paragraph("Test paragraph.")
        document.save(self.path)

    def test_docx_has_paragraphs(self):
        """The probe result has a "document" section with a positive paragraph count."""
        result = json.loads(probe_file.probe(self.path))
        self.assertIn("document", result)
        self.assertGreater(result["document"]["paragraphs"], 0)
|
||||
|
||||
|
||||
class TestProbeXLSX(unittest.TestCase):
    """Probe a freshly generated XLSX workbook and check the reported fields."""

    def setUp(self):
        """Write a single-sheet workbook into a per-test temporary directory."""
        import tempfile

        import openpyxl

        # A private directory avoids silently reusing a stale or foreign file
        # at a predictable /tmp path, and is removed when the test finishes.
        tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(tmp_dir.cleanup)
        self.path = os.path.join(tmp_dir.name, "_test_probe.xlsx")

        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = "Sheet1"
        sheet.append(["A", 1])
        workbook.save(self.path)

    def test_xlsx_has_sheets(self):
        """The probe result has a "spreadsheet" section with a positive sheet count."""
        result = json.loads(probe_file.probe(self.path))
        self.assertIn("spreadsheet", result)
        self.assertGreater(result["spreadsheet"]["sheet_count"], 0)
|
||||
|
||||
|
||||
class TestProbePPTX(unittest.TestCase):
    """Probe a freshly generated PPTX deck and check the reported fields."""

    def setUp(self):
        """Write a one-slide presentation into a per-test temporary directory."""
        import tempfile

        from pptx import Presentation

        # A private directory avoids silently reusing a stale or foreign file
        # at a predictable /tmp path, and is removed when the test finishes.
        tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(tmp_dir.cleanup)
        self.path = os.path.join(tmp_dir.name, "_test_probe.pptx")

        presentation = Presentation()
        slide = presentation.slides.add_slide(presentation.slide_layouts[0])
        slide.shapes.title.text = "Test"
        presentation.save(self.path)

    def test_pptx_has_slides(self):
        """The probe result has a "presentation" section with a positive slide count."""
        result = json.loads(probe_file.probe(self.path))
        self.assertIn("presentation", result)
        self.assertGreater(result["presentation"]["slide_count"], 0)
|
||||
|
||||
|
||||
class TestProbeUnknown(unittest.TestCase):
    """Probe a file whose extension is not a recognised document type."""

    def test_unknown_extension(self):
        """The probe result for a ".xyz" file carries an empty "streams" list."""
        import tempfile

        # The context manager removes the scratch file even when probe() or
        # json.loads() raises, which a trailing os.remove() would not.
        with tempfile.TemporaryDirectory() as tmp_dir:
            path = os.path.join(tmp_dir, "_test_probe.xyz")
            with open(path, "w") as f:
                f.write("test")
            result = json.loads(probe_file.probe(path))

        self.assertEqual(result["streams"], [])
|
||||
|
||||
|
||||
# Run the suite only when this file is executed directly, not when imported.
if __name__ == "__main__":
    unittest.main()
|
||||
@@ -821,6 +821,8 @@ mod tests {
|
||||
ocr_text: None,
|
||||
has_face: None,
|
||||
speaker_id: None,
|
||||
chunk_type: None,
|
||||
co_appears_with_trace_id: None,
|
||||
min_confidence: Some(0.8),
|
||||
min_unique_classes: Some(3),
|
||||
min_spatial_density: Some(0.5),
|
||||
|
||||
@@ -4912,16 +4912,19 @@ mod tests {
|
||||
started_at: Some(
|
||||
chrono::DateTime::parse_from_rfc3339("2024-01-01T10:00:00Z")
|
||||
.unwrap()
|
||||
.with_timezone(&chrono::Utc),
|
||||
.with_timezone(&chrono::Utc)
|
||||
.to_rfc3339(),
|
||||
),
|
||||
updated_at: Some(
|
||||
chrono::DateTime::parse_from_rfc3339("2024-01-01T10:05:00Z")
|
||||
.unwrap()
|
||||
.with_timezone(&chrono::Utc),
|
||||
.with_timezone(&chrono::Utc)
|
||||
.to_rfc3339(),
|
||||
),
|
||||
created_at: chrono::DateTime::parse_from_rfc3339("2024-01-01T09:55:00Z")
|
||||
.unwrap()
|
||||
.into(),
|
||||
.with_timezone(&chrono::Utc)
|
||||
.to_rfc3339(),
|
||||
processors: vec!["asr".to_string(), "cut".to_string()],
|
||||
completed_processors: vec!["asr".to_string()],
|
||||
failed_processors: vec![],
|
||||
|
||||
@@ -1,6 +1,75 @@
|
||||
use std::path::Path;
|
||||
use std::time::SystemTime;
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Asserts that every file name in `names` is classified as `expected`,
    /// naming the offending file on failure.
    fn assert_category(names: &[&str], expected: FileCategory) {
        for &name in names {
            assert_eq!(
                detect_category(Path::new(name)),
                expected,
                "unexpected category for {}",
                name
            );
        }
    }

    #[test]
    fn test_detect_category_video() {
        assert_category(
            &["video.mp4", "video.mov", "video.mkv", "video.avi"],
            FileCategory::Video,
        );
    }

    #[test]
    fn test_detect_category_image() {
        assert_category(
            &[
                "photo.jpg",
                "photo.jpeg",
                "photo.png",
                "photo.svg",
                "photo.webp",
            ],
            FileCategory::Image,
        );
    }

    #[test]
    fn test_detect_category_document() {
        assert_category(
            &["doc.pdf", "doc.docx", "doc.pages", "doc.txt"],
            FileCategory::Document,
        );
    }

    #[test]
    fn test_detect_category_spreadsheet() {
        assert_category(
            &["data.xlsx", "data.csv", "data.numbers"],
            FileCategory::Spreadsheet,
        );
    }

    #[test]
    fn test_detect_category_presentation() {
        assert_category(&["deck.pptx", "deck.key"], FileCategory::Presentation);
    }

    #[test]
    fn test_detect_category_archive() {
        assert_category(&["files.zip", "files.tar.gz"], FileCategory::Archive);
    }

    #[test]
    fn test_detect_category_unknown() {
        assert_category(&["file.xyz", "file"], FileCategory::Unknown);
    }

    #[test]
    fn test_base_format_info() {
        // Create a temp file and verify base_format_info returns correct fields.
        // The process id keeps concurrent test runs from sharing one file.
        let tmp = std::env::temp_dir()
            .join(format!("_test_unified_probe_{}.txt", std::process::id()));
        fs::write(&tmp, b"hello probe").unwrap();

        let info = base_format_info(&tmp);

        // Remove the file before asserting so a failed assertion cannot leak it.
        let _ = fs::remove_file(&tmp);

        assert_eq!(info["file_type"], "document");
        assert_eq!(info["format_name"], "txt");
        assert!(!info["size"].as_str().unwrap_or("").is_empty());
        assert!(!info["mtime"].as_str().unwrap_or("").is_empty());
    }
}
|
||||
|
||||
/// File category derived from extension
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub enum FileCategory {
|
||||
|
||||
Reference in New Issue
Block a user