#!/opt/homebrew/bin/python3.11 """ Unified file probe — metadata extraction for all managed file types. Called by Rust unified_probe() via PythonExecutor. Output: JSON to stdout. """ import sys, json, os def probe_pdf(path): from PyPDF2 import PdfReader r = PdfReader(path) meta = {"pages": len(r.pages)} if r.metadata: if r.metadata.get('/Author'): meta["author"] = r.metadata['/Author'] if r.metadata.get('/Title'): meta["title"] = r.metadata['/Title'] if r.metadata.get('/Producer'): meta["producer"] = r.metadata['/Producer'] return meta def probe_docx(path): from docx import Document d = Document(path) meta = {"paragraphs": len(d.paragraphs), "sections": len(d.sections)} if d.core_properties.author: meta["author"] = d.core_properties.author return meta def probe_xlsx(path): import openpyxl wb = openpyxl.load_workbook(path, read_only=True) meta = {"sheet_names": wb.sheetnames, "sheet_count": len(wb.sheetnames)} wb.close() return meta def probe_pptx(path): from pptx import Presentation prs = Presentation(path) meta = {"slide_count": len(prs.slides)} texts = [] for slide in prs.slides: for shape in slide.shapes: if shape.has_text_frame: t = shape.text_frame.text.strip() if t: texts.append(t) if texts: meta["text_preview"] = " | ".join(texts[:5]) return meta def probe_archive(path): import zipfile, tarfile name = os.path.basename(path) entries = [] if name.endswith('.zip'): with zipfile.ZipFile(path) as z: for e in z.infolist(): entries.append({"name": e.filename, "size": e.file_size}) elif name.endswith('.tar') or '.tar.' in name: with tarfile.open(path) as t: for e in t.getmembers(): entries.append({"name": e.name, "size": e.size}) return {"entry_count": len(entries), "entries": entries[:50]} def probe_iwork(path): import zipfile try: with zipfile.ZipFile(path) as z: names = z.namelist() has_preview = "preview.pdf" in names or "preview.jpg" in names thumb = [n for n in names if "Thumbnail" in n or "preview" in n] return {"has_preview": has_preview, "preview_files": thumb} except Exception: return {"has_preview": False} def probe(path): ext = os.path.splitext(path)[1].lower() result = {"streams": []} if ext in ('.pdf',): result["document"] = probe_pdf(path) elif ext in ('.docx', '.doc'): result["document"] = probe_docx(path) elif ext in ('.xlsx', '.xls'): result["spreadsheet"] = probe_xlsx(path) elif ext in ('.pptx', '.ppt'): result["presentation"] = probe_pptx(path) elif ext in ('.pages', '.numbers', '.key'): result["apple_iwork"] = probe_iwork(path) elif ext in ('.zip', '.tar', '.gz', '.tgz', '.7z', '.rar'): result["archive"] = probe_archive(path) return json.dumps(result) if __name__ == '__main__': if len(sys.argv) < 2: print(json.dumps({"error": "Usage: probe_file.py "})) sys.exit(1) print(probe(sys.argv[1]))