diff --git a/docs_v1.0/DESIGN/FILE_PROBE_SOP_V1.0.md b/docs_v1.0/DESIGN/FILE_PROBE_SOP_V1.0.md new file mode 100644 index 0000000..46bba7c --- /dev/null +++ b/docs_v1.0/DESIGN/FILE_PROBE_SOP_V1.0.md @@ -0,0 +1,196 @@ +--- +document_type: "design_doc" +service: "MOMENTRY_CORE" +title: "File Probe SOP — Unified Metadata Extraction" +version: "V1.0" +date: "2026-05-15" +author: "M5" +status: "draft" +--- + +# File Probe SOP — Unified Metadata Extraction + +| Item | Value | +|------|-------| +| Scope | All managed file types (video, image, document, spreadsheet, presentation, archive) | +| Status | Draft | +| Key principle | Every file produces a `probe_json` metadata block via a unified procedure | + +## Overview + +Currently `ffprobe` handles video/image files; all other files get only a minimal `fs::metadata()` fallback. +This SOP defines a unified probe for ALL file types. + +## Step 0: File Category Detection (extension → category) + +``` +video: mp4, mov, mkv, avi, webm, m4v, mpeg +image: jpg, jpeg, png, gif, bmp, webp, svg, heic, tiff +document: pdf, doc, docx, odt, pages, rtf, txt, md +spreadsheet: xls, xlsx, csv, ods, numbers +presentation: ppt, pptx, odp, key +archive: zip, tar, gz, 7z, rar +unknown: everything else +``` + +## Step 1: Probe by Category + +| Category | Tool | License | Source Build | Phase | +|----------|------|---------|:-----------:|:----:| +| video / image | `ffprobe` (macOS built-in) | Apple EULA | ✅ system | 1 | +| `.pdf` | `PyPDF2` | BSD-3-Clause | ✅ pure Python | 2 | +| `.docx` | `python-docx` | MIT | ✅ pure Python | 2 | +| `.xlsx` | `openpyxl` | MIT | ✅ from source | 2 | +| `.pptx` | `python-pptx` | MIT | ✅ pure Python | 2 | +| `.pages/.key/.numbers` | `unzip` → `preview.pdf` | Apple EULA | ✅ system | 1 | +| archive | `unzip -l` / `tar -tvf` | Apple EULA | ✅ system | 1 | +| unknown | `fs::metadata()` | N/A | ✅ no dep | 1 | + +## Step 2: Unified probe_json Format + +```json +{ + "format": { + "filename": "/data/demo/report.docx", 
"format_name": "docx", + "file_type": "document", + "size": "245760", + "mtime": "2026-05-15T02:15:00Z" + }, + "streams": [], + "category_meta": { + "pages": 12, + "paragraphs": 87, + "author": "Warren", + "created_at": "2026-03-15T10:00:00Z" + } +} +``` + +### Category Metadata Fields + +| Category | Fields | Tool | +|----------|--------|------| +| video / image | `streams[].codec_type, width, height, fps` | ffprobe | +| document (pdf) | `pages, author, title, page_size` | PyPDF2 | +| document (docx) | `paragraphs, author, created_at` | python-docx | +| spreadsheet | `sheet_names, sheet_count, row_count` | openpyxl | +| presentation | `slide_count, speaker_notes` | python-pptx | +| archive | `entry_count, entries: [{name, size}]` | unzip / tar | +| unknown | (none) | fs::metadata | + +## Implementation + +### Rust: `src/core/probe/unified.rs` + +```rust +pub enum FileCategory { Video, Image, Document, Spreadsheet, Presentation, Archive, Unknown } + +pub fn detect_category(path: &Path) -> FileCategory { + let ext = path.extension().and_then(|e| e.to_str()).map(|e| e.to_lowercase()); + match ext.as_deref() { + Some("mp4"|"mov"|"mkv"|"avi"|"webm"|"m4v"|"mpeg") => FileCategory::Video, + Some("jpg"|"jpeg"|"png"|"gif"|"bmp"|"webp"|"svg"|"heic"|"tiff") => FileCategory::Image, + Some("pdf"|"doc"|"docx"|"odt"|"pages"|"rtf"|"txt"|"md") => FileCategory::Document, + Some("xls"|"xlsx"|"csv"|"ods"|"numbers") => FileCategory::Spreadsheet, + Some("ppt"|"pptx"|"odp"|"key") => FileCategory::Presentation, + Some("zip"|"tar"|"gz"|"7z"|"rar") => FileCategory::Archive, + _ => FileCategory::Unknown, + } +} + +pub async fn unified_probe(path: &Path, scripts_dir: &str, python_path: &str) -> serde_json::Value { + let category = detect_category(path); + let format_base = base_format_info(path); + + match category { + FileCategory::Video | FileCategory::Image => { + ffprobe_probe(path, format_base) + } + FileCategory::Document | FileCategory::Spreadsheet | FileCategory::Presentation => { + python_probe(path, 
category, scripts_dir, python_path, format_base) + } + _ => minimal_probe(path, format_base), + } +} +``` + +### Python: `scripts/probe_file.py` + +```python +#!/opt/homebrew/bin/python3.11 +"""Unified file probe dispatcher.""" +import sys, json, os + +def probe(path): + ext = os.path.splitext(path)[1].lower() + result = {"streams": []} + if ext == '.pdf': + from PyPDF2 import PdfReader + r = PdfReader(path) + result["category_meta"] = { + "pages": len(r.pages), + "author": r.metadata.get('/Author', '') if r.metadata else '', + } + elif ext in ('.docx',): + from docx import Document + d = Document(path) + result["category_meta"] = { + "paragraphs": len(d.paragraphs), + "author": d.core_properties.author or '', + } + elif ext in ('.xlsx',): + import openpyxl + wb = openpyxl.load_workbook(path, read_only=True) + result["category_meta"] = { + "sheet_names": wb.sheetnames, + "sheet_count": len(wb.sheetnames), + } + elif ext in ('.pptx',): + from pptx import Presentation + prs = Presentation(path) + result["category_meta"] = {"slide_count": len(prs.slides)} + return json.dumps(result) + +if __name__ == '__main__': + print(probe(sys.argv[1])) +``` + +### Integration Points + +| File | Change | +|------|--------| +| `src/core/probe/unified.rs` | **New** — unified probe dispatcher | +| `src/core/probe/mod.rs` | Add `pub mod unified;` | +| `scripts/probe_file.py` | **New** — Python probe per category | +| `src/api/server.rs` | Step 5: replace `probe_video()` with `unified_probe()` | +| `src/watcher/watcher.rs` | `pre_process_file()` uses `unified_probe()` | + +### Dependencies + +```bash +pip3 install PyPDF2 python-docx openpyxl python-pptx +``` + +| Package | Version | License | Dependencies | +|---------|---------|---------|-------------| +| `PyPDF2` | 3.0.1+ | BSD-3-Clause | pure Python | +| `python-docx` | 1.2.0+ | MIT | pure Python | +| `openpyxl` | 3.1.5+ | MIT | + Pillow + et_xmlfile | +| `python-pptx` | 1.0.2+ | MIT | + lxml + Pillow + XlsxWriter | + +All verified 
to build from source (`--no-binary :all:`). All dependencies are MIT/BSD/Apache-2.0 licensed — no GPL dependency chain. + +## Phase Plan + +| Phase | Tools | Install | Status | +|-------|-------|---------|--------| +| 1 | `fs::metadata()` fallback | Built-in | ✅ Done | +| 2 | Python packages (PyPDF2, python-docx, openpyxl, python-pptx) | `pip3 install` | ⬜ Ready | +| 3 | `pdfinfo` (poppler), `libreoffice` (optional) | `brew install` | ⬜ Optional | + +## Version History + +| Version | Date | Changes | +|---------|------|---------| +| V1.0 | 2026-05-15 | Initial — unified probe SOP for all file types |