docs: unified file probe SOP design — PyPDF2, python-docx, openpyxl, python-pptx
This commit is contained in:
196
docs_v1.0/DESIGN/FILE_PROBE_SOP_V1.0.md
Normal file
196
docs_v1.0/DESIGN/FILE_PROBE_SOP_V1.0.md
Normal file
@@ -0,0 +1,196 @@
|
|||||||
|
---
|
||||||
|
document_type: "design_doc"
|
||||||
|
service: "MOMENTRY_CORE"
|
||||||
|
title: "File Probe SOP — Unified Metadata Extraction"
|
||||||
|
version: "V1.0"
|
||||||
|
date: "2026-05-15"
|
||||||
|
author: "M5"
|
||||||
|
status: "draft"
|
||||||
|
---
|
||||||
|
|
||||||
|
# File Probe SOP — Unified Metadata Extraction
|
||||||
|
|
||||||
|
| Item | Value |
|
||||||
|
|------|-------|
|
||||||
|
| Scope | All managed file types (video, image, document, spreadsheet, presentation, archive) |
|
||||||
|
| Status | Draft |
|
||||||
|
| Key principle | Every file produces a `probe_json` metadata block via a unified procedure |
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Currently `ffprobe` handles video/image files; all other file types get only a minimal `fs::metadata()` fallback.
|
||||||
|
This SOP defines a unified probe for ALL file types.
|
||||||
|
|
||||||
|
## Step 0: File Category Detection (extension → category)
|
||||||
|
|
||||||
|
```
|
||||||
|
video: mp4, mov, mkv, avi, webm, m4v, mpeg
|
||||||
|
image: jpg, jpeg, png, gif, bmp, webp, svg, heic, tiff
|
||||||
|
document: pdf, doc, docx, odt, pages, rtf, txt, md
|
||||||
|
spreadsheet: xls, xlsx, csv, ods, numbers
|
||||||
|
presentation: ppt, pptx, odp, key
|
||||||
|
archive: zip, tar, gz, 7z, rar
|
||||||
|
unknown: everything else
|
||||||
|
```
|
||||||
|
|
||||||
|
## Step 1: Probe by Category
|
||||||
|
|
||||||
|
| Category | Tool | License | Source Build | Phase |
|
||||||
|
|----------|------|---------|:-----------:|:----:|
|
||||||
|
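| video / image | `ffprobe` (part of FFmpeg; not bundled with macOS — e.g. `brew install ffmpeg`) | LGPL-2.1+ (GPL when built with `--enable-gpl`) | ⚠️ external install | 1 |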
|
||||||
|
| `.pdf` | `PyPDF2` | BSD-3-Clause | ✅ pure Python | 2 |
|
||||||
|
| `.docx` | `python-docx` | MIT | ✅ pure Python | 2 |
|
||||||
|
| `.xlsx` | `openpyxl` | MIT | ✅ from source | 2 |
|
||||||
|
| `.pptx` | `python-pptx` | MIT | ✅ pure Python | 2 |
|
||||||
|
| `.pages/.key/.numbers` | `unzip` → `preview.pdf` | Apple EULA | ✅ system | 1 |
|
||||||
|
| archive | `unzip -l` / `tar -tvf` | Apple EULA | ✅ system | 1 |
|
||||||
|
| unknown | `fs::metadata()` | N/A | ✅ no dep | 1 |
|
||||||
|
|
||||||
|
## Step 2: Unified probe_json Format
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"format": {
|
||||||
|
"filename": "/data/demo/report.docx",
|
||||||
|
"format_name": "docx",
|
||||||
|
"file_type": "document",
|
||||||
|
"size": "245760",
|
||||||
|
"mtime": "2026-05-15T02:15:00Z"
|
||||||
|
},
|
||||||
|
"streams": [],
|
||||||
|
"category_meta": {
|
||||||
|
"pages": 12,
|
||||||
|
"paragraphs": 87,
|
||||||
|
"author": "Warren",
|
||||||
|
"created_at": "2026-03-15T10:00:00Z"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Category Metadata Fields
|
||||||
|
|
||||||
|
| Category | Fields | Tool |
|
||||||
|
|----------|--------|------|
|
||||||
|
| video / image | `streams[].codec_type, width, height, fps` | ffprobe |
|
||||||
|
| document (pdf) | `pages, author, title, page_size` | PyPDF2 |
|
||||||
|
| document (docx) | `paragraphs, author, created_at` | python-docx |
|
||||||
|
| spreadsheet | `sheet_names, sheet_count, row_count` | openpyxl |
|
||||||
|
| presentation | `slide_count, speaker_notes` | python-pptx |
|
||||||
|
| archive | `entry_count, entries: [{name, size}]` | unzip / tar |
|
||||||
|
| unknown | (none) | fs::metadata |
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
### Rust: `src/core/probe/unified.rs`
|
||||||
|
|
||||||
|
```rust
|
||||||
|
/// Coarse file category used to choose which probe handles a file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FileCategory {
    Video,
    Image,
    Document,
    Spreadsheet,
    Presentation,
    Archive,
    Unknown,
}

/// Maps the extension of `path` (compared case-insensitively) to a [`FileCategory`].
///
/// The extension lists mirror the "Step 0: File Category Detection" table of this SOP,
/// including `mpeg`, `heic` and `tiff`. A path with no extension, a non-UTF-8 extension,
/// or an extension not listed below yields [`FileCategory::Unknown`].
pub fn detect_category(path: &Path) -> FileCategory {
    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .map(str::to_lowercase);

    match ext.as_deref() {
        Some("mp4" | "mov" | "mkv" | "avi" | "webm" | "m4v" | "mpeg") => FileCategory::Video,
        Some("jpg" | "jpeg" | "png" | "gif" | "bmp" | "webp" | "svg" | "heic" | "tiff") => {
            FileCategory::Image
        }
        Some("pdf" | "doc" | "docx" | "odt" | "pages" | "rtf" | "txt" | "md") => {
            FileCategory::Document
        }
        Some("xls" | "xlsx" | "csv" | "ods" | "numbers") => FileCategory::Spreadsheet,
        Some("ppt" | "pptx" | "odp" | "key") => FileCategory::Presentation,
        Some("zip" | "tar" | "gz" | "7z" | "rar") => FileCategory::Archive,
        _ => FileCategory::Unknown,
    }
}
|
||||||
|
|
||||||
|
pub async fn unified_probe(path: &Path, scripts_dir: &str, python_path: &str) -> serde_json::Value {
|
||||||
|
let category = detect_category(path);
|
||||||
|
let format_base = base_format_info(path);
|
||||||
|
|
||||||
|
match category {
|
||||||
|
FileCategory::Video | FileCategory::Image => {
|
||||||
|
ffprobe_probe(path, format_base)
|
||||||
|
}
|
||||||
|
FileCategory::Document | FileCategory::Spreadsheet | FileCategory::Presentation => {
|
||||||
|
python_probe(path, category, scripts_dir, python_path, format_base)
|
||||||
|
}
|
||||||
|
_ => minimal_probe(path, format_base),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Python: `scripts/probe_file.py`
|
||||||
|
|
||||||
|
```python
|
||||||
|
#!/opt/homebrew/bin/python3.11
|
||||||
|
"""Unified file probe dispatcher."""
|
||||||
|
import sys, json, os
|
||||||
|
|
||||||
|
def probe(path):
    """Return a JSON string holding probe metadata for ``path``.

    The file extension (compared case-insensitively) selects the parser:

    * ``.pdf``  -> PyPDF2: ``pages``, ``author``
    * ``.docx`` -> python-docx: ``paragraphs``, ``author``
    * ``.xlsx`` -> openpyxl: ``sheet_names``, ``sheet_count``
    * ``.pptx`` -> python-pptx: ``slide_count``

    The parsed values are stored under ``category_meta``. Any other extension
    produces only the empty ``streams`` list. Parser libraries are imported
    inside their branch, so only the one actually needed has to be installed.
    Exceptions raised by a parser are not caught here.
    """
    ext = os.path.splitext(path)[1].lower()
    result = {"streams": []}

    if ext == '.pdf':
        from PyPDF2 import PdfReader

        reader = PdfReader(path)
        info = reader.metadata
        result["category_meta"] = {
            "pages": len(reader.pages),
            # The `.author` accessor yields text or None, so the value is
            # always JSON-serialisable (a raw `/Author` entry may not be).
            "author": (info.author or '') if info else '',
        }
    elif ext == '.docx':
        from docx import Document

        document = Document(path)
        result["category_meta"] = {
            "paragraphs": len(document.paragraphs),
            "author": document.core_properties.author or '',
        }
    elif ext == '.xlsx':
        import openpyxl

        workbook = openpyxl.load_workbook(path, read_only=True)
        try:
            sheet_names = list(workbook.sheetnames)
        finally:
            # A read-only workbook keeps the file open until close() is called.
            workbook.close()
        result["category_meta"] = {
            "sheet_names": sheet_names,
            "sheet_count": len(sheet_names),
        }
    elif ext == '.pptx':
        from pptx import Presentation

        presentation = Presentation(path)
        result["category_meta"] = {"slide_count": len(presentation.slides)}

    return json.dumps(result)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # The file to probe is the first command-line argument; without it,
    # report usage instead of failing with a bare IndexError traceback.
    if len(sys.argv) < 2:
        sys.stderr.write("usage: probe_file.py <path>\n")
        sys.exit(2)
    print(probe(sys.argv[1]))
|
||||||
|
```
|
||||||
|
|
||||||
|
### Integration Points
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `src/core/probe/unified.rs` | **New** — unified probe dispatcher |
|
||||||
|
| `src/core/probe/mod.rs` | Add `pub mod unified;` |
|
||||||
|
| `scripts/probe_file.py` | **New** — Python probe per category |
|
||||||
|
| `src/api/server.rs` | Step 5: replace `probe_video()` with `unified_probe()` |
|
||||||
|
| `src/watcher/watcher.rs` | `pre_process_file()` uses `unified_probe()` |
|
||||||
|
|
||||||
|
### Dependencies
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip3 install PyPDF2 python-docx openpyxl python-pptx
|
||||||
|
```
|
||||||
|
|
||||||
|
| Package | Version | License | Dependencies |
|
||||||
|
|---------|---------|---------|-------------|
|
||||||
|
| `PyPDF2` | 3.0.1 (final release; maintained successor is `pypdf`) | BSD-3-Clause | pure Python |
|
||||||
|
| `python-docx` | 1.2.0+ | MIT | pure Python |
|
||||||
|
| `openpyxl` | 3.1.5+ | MIT | + et_xmlfile (Pillow optional, only for images) |
|
||||||
|
| `python-pptx` | 1.0.2+ | MIT | + lxml + Pillow + XlsxWriter |
|
||||||
|
|
||||||
|
All verified to build from source (`--no-binary :all:`). All dependencies are MIT/BSD/Apache-2.0 licensed — no GPL dependency chain.
|
||||||
|
|
||||||
|
## Phase Plan
|
||||||
|
|
||||||
|
| Phase | Tools | Install | Status |
|
||||||
|
|-------|-------|---------|--------|
|
||||||
|
| 1 | `fs::metadata()` fallback | Built-in | ✅ Done |
|
||||||
|
| 2 | Python packages (PyPDF2, docx, openpyxl, pptx) | `pip3 install` | ⬜ Ready |
|
||||||
|
| 3 | `pdfinfo` (poppler), `libreoffice` (optional) | `brew install` | ⬜ Optional |
|
||||||
|
|
||||||
|
## Version History
|
||||||
|
|
||||||
|
| Version | Date | Changes |
|
||||||
|
|---------|------|---------|
|
||||||
|
| V1.0 | 2026-05-15 | Initial — unified probe SOP for all file types |
|
||||||
Reference in New Issue
Block a user