feat: Initial v0.9 release with API Key authentication
## v0.9.20260325_144654 ### Features - API Key Authentication System - Job Worker System - V2 Backup Versioning ### Bug Fixes - get_processor_results_by_job column mapping Co-authored-by: OpenCode
This commit is contained in:
@@ -1,5 +1,21 @@
|
||||
# Video Chunk 切分規範
|
||||
|
||||
| 項目 | 內容 |
|
||||
|------|------|
|
||||
| 建立者 | Warren |
|
||||
| 建立時間 | 2026-03-16 |
|
||||
| 文件版本 | V1.0 |
|
||||
|
||||
---
|
||||
|
||||
## 版本歷史
|
||||
|
||||
| 版本 | 日期 | 目的 | 操作人 | 工具/模型 |
|
||||
|------|------|------|--------|-----------|
|
||||
| V1.0 | 2026-03-16 | 創建文件 | Warren | OpenCode / MiniMax M2.5 |
|
||||
|
||||
---
|
||||
|
||||
本文檔定義 Momentry Core 系統中影片 chunks 的切分原則與資料結構。
|
||||
|
||||
---
|
||||
@@ -579,7 +595,518 @@ TimeBased Chunks (4 個, 重疊 2秒):
|
||||
|
||||
---
|
||||
|
||||
## 10. 相關文件
|
||||
## 10. 資料庫儲存
|
||||
|
||||
### 10.1 PostgreSQL 儲存
|
||||
|
||||
#### Table Schema
|
||||
|
||||
```sql
|
||||
CREATE TABLE chunks (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
uuid VARCHAR(16) NOT NULL,
|
||||
chunk_id VARCHAR(64) NOT NULL,
|
||||
chunk_index INTEGER NOT NULL,
|
||||
chunk_type VARCHAR(32) NOT NULL,
|
||||
start_time DOUBLE PRECISION NOT NULL,
|
||||
start_frame BIGINT NOT NULL,
|
||||
end_time DOUBLE PRECISION NOT NULL,
|
||||
end_frame BIGINT NOT NULL,
|
||||
fps VARCHAR(16) NOT NULL,
|
||||
fps_value DOUBLE PRECISION NOT NULL,
|
||||
content JSONB NOT NULL,
|
||||
metadata JSONB,
|
||||
vector_id VARCHAR(64),
|
||||
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
|
||||
updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
|
||||
UNIQUE(uuid, chunk_id)
|
||||
);
|
||||
|
||||
-- 索引
|
||||
CREATE INDEX idx_chunks_uuid ON chunks(uuid);
|
||||
CREATE INDEX idx_chunks_type ON chunks(chunk_type);
|
||||
CREATE INDEX idx_chunks_time ON chunks(start_time, end_time);
|
||||
CREATE INDEX idx_chunks_uuid_type ON chunks(uuid, chunk_type);
|
||||
CREATE INDEX idx_chunks_vector_id ON chunks(vector_id);
|
||||
```
|
||||
|
||||
#### 儲存範例
|
||||
|
||||
```rust
|
||||
pub async fn store_chunk_to_postgres(db: &PostgresDb, chunk: &Chunk) -> Result<()> {
|
||||
sqlx::query!(
|
||||
r#"
|
||||
INSERT INTO chunks (
|
||||
uuid, chunk_id, chunk_index, chunk_type,
|
||||
start_time, start_frame, end_time, end_frame,
|
||||
fps, fps_value, content, metadata, vector_id
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13)
|
||||
ON CONFLICT (uuid, chunk_id) DO UPDATE SET
|
||||
content = EXCLUDED.content,
|
||||
metadata = EXCLUDED.metadata,
|
||||
vector_id = EXCLUDED.vector_id,
|
||||
updated_at = NOW()
|
||||
"#,
|
||||
chunk.uuid,
|
||||
chunk.chunk_id,
|
||||
chunk.chunk_index as i32,
|
||||
chunk.chunk_type.as_str(),
|
||||
chunk.start_time,
|
||||
chunk.start_frame,
|
||||
chunk.end_time,
|
||||
chunk.end_frame,
|
||||
chunk.fps,
|
||||
chunk.fps_value,
|
||||
serde_json::to_value(&chunk.content)?,
|
||||
serde_json::to_value(&chunk.metadata)?,
|
||||
chunk.vector_id,
|
||||
)
|
||||
.execute(&db.pool)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 10.2 MongoDB 儲存
|
||||
|
||||
#### Collection Schema
|
||||
|
||||
```javascript
|
||||
// chunks collection
|
||||
{
|
||||
_id: ObjectId,
|
||||
uuid: "1636719dc31f78ac",
|
||||
chunk_id: "sentence_0001",
|
||||
chunk_index: 1,
|
||||
chunk_type: "sentence",
|
||||
start_time: 10.5,
|
||||
start_frame: 252,
|
||||
end_time: 15.75,
|
||||
end_frame: 378,
|
||||
fps: "24/1",
|
||||
fps_value: 24.0,
|
||||
content: {
|
||||
text: "Hello world, this is a test",
|
||||
text_normalized: "hello world this is a test",
|
||||
word_count: 6,
|
||||
char_count: 27
|
||||
},
|
||||
metadata: {
|
||||
source: "asr",
|
||||
confidence: 0.95,
|
||||
language: "en"
|
||||
},
|
||||
vector_id: "vec_sentence_0001",
|
||||
created_at: ISODate("2026-03-16T10:00:00Z"),
|
||||
updated_at: ISODate("2026-03-16T10:00:00Z")
|
||||
}
|
||||
|
||||
// 索引
|
||||
db.chunks.createIndex({ uuid: 1 })
|
||||
db.chunks.createIndex({ chunk_type: 1 })
|
||||
db.chunks.createIndex({ start_time: 1, end_time: 1 })
|
||||
db.chunks.createIndex({ vector_id: 1 })
|
||||
db.chunks.createIndex({ uuid: 1, chunk_type: 1 })
|
||||
```
|
||||
|
||||
#### 儲存範例
|
||||
|
||||
```rust
|
||||
pub async fn store_chunk_to_mongodb(db: &MongoDb, chunk: &Chunk) -> Result<()> {
|
||||
let doc = bson::doc! {
|
||||
"uuid": chunk.uuid,
|
||||
"chunk_id": chunk.chunk_id,
|
||||
"chunk_index": chunk.chunk_index,
|
||||
"chunk_type": chunk.chunk_type.as_str(),
|
||||
"start_time": chunk.start_time,
|
||||
"start_frame": chunk.start_frame,
|
||||
"end_time": chunk.end_time,
|
||||
"end_frame": chunk.end_frame,
|
||||
"fps": chunk.fps,
|
||||
"fps_value": chunk.fps_value,
|
||||
"content": serde_json::to_value(&chunk.content)?,
|
||||
"metadata": serde_json::to_value(&chunk.metadata)?,
|
||||
"vector_id": chunk.vector_id,
|
||||
"created_at": chrono::Utc::now(),
|
||||
"updated_at": chrono::Utc::now()
|
||||
};
|
||||
|
||||
let collection = db.database("momentry").collection("chunks");
|
||||
collection.update_one(
|
||||
doc! { "uuid": &chunk.uuid, "chunk_id": &chunk.chunk_id },
|
||||
doc! { "$set": doc },
|
||||
UpdateOptions::builder().upsert(true).build(),
|
||||
).await?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 11. 向量儲存設計
|
||||
|
||||
### 11.1 設計原則
|
||||
|
||||
**統一向量 ID 格式**,確保 Qdrant 與 PostgreSQL 相容(注意:Qdrant 的 point ID 僅接受無號 64 位元整數或 UUID,因此下列字串 ID 應存放於 payload,並以其衍生的 UUID 作為 Qdrant point ID):
|
||||
|
||||
```
|
||||
{chunk_type}_{chunk_index:04}
|
||||
|
||||
範例:
|
||||
sentence_0001
|
||||
cut_0002
|
||||
time_based_0015
|
||||
```
|
||||
|
||||
### 11.2 Qdrant Collection
|
||||
|
||||
#### 建立 Collection
|
||||
|
||||
```bash
|
||||
# 使用 Qdrant client 建立 collection
|
||||
curl -X PUT http://localhost:6333/collections/chunks \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "api-key: $QDRANT_API_KEY" \
|
||||
-d '{
|
||||
"vectors": {
|
||||
"size": 768,
|
||||
"distance": "Cosine"
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
#### Point 結構
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "sentence_0001",
|
||||
"vector": [0.123, -0.456, ...],
|
||||
"payload": {
|
||||
"uuid": "1636719dc31f78ac",
|
||||
"chunk_id": "sentence_0001",
|
||||
"chunk_type": "sentence",
|
||||
"chunk_index": 1,
|
||||
"start_time": 10.5,
|
||||
"end_time": 15.75,
|
||||
"text": "Hello world, this is a test",
|
||||
"metadata": {
|
||||
"confidence": 0.95,
|
||||
"language": "en"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Rust 結構
|
||||
|
||||
```rust
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct VectorPoint {
|
||||
pub id: String,
|
||||
pub vector: Vec<f32>,
|
||||
pub payload: VectorPayload,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct VectorPayload {
|
||||
pub uuid: String,
|
||||
pub chunk_id: String,
|
||||
pub chunk_type: String,
|
||||
pub chunk_index: u32,
|
||||
pub start_time: f64,
|
||||
pub end_time: f64,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub text: Option<String>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub scene_id: Option<i32>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub segment_number: Option<i32>,
|
||||
pub metadata: Option<serde_json::Value>,
|
||||
}
|
||||
```
|
||||
|
||||
### 11.3 PostgreSQL Vector 儲存
|
||||
|
||||
#### Table Schema
|
||||
|
||||
```sql
|
||||
-- 使用 pgvector 擴展
|
||||
CREATE EXTENSION IF NOT EXISTS vector;
|
||||
|
||||
CREATE TABLE chunk_vectors (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
vector_id VARCHAR(64) NOT NULL UNIQUE,
|
||||
uuid VARCHAR(16) NOT NULL,
|
||||
chunk_id VARCHAR(64) NOT NULL,
|
||||
chunk_type VARCHAR(32) NOT NULL,
|
||||
chunk_index INTEGER NOT NULL,
|
||||
start_time DOUBLE PRECISION NOT NULL,
|
||||
end_time DOUBLE PRECISION NOT NULL,
|
||||
embedding vector(768) NOT NULL,
|
||||
metadata JSONB,
|
||||
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
|
||||
|
||||
FOREIGN KEY (uuid, chunk_id) REFERENCES chunks(uuid, chunk_id)
|
||||
);
|
||||
|
||||
-- 向量檢索索引 (IVFFlat)
|
||||
CREATE INDEX idx_chunk_vectors_embedding
|
||||
ON chunk_vectors
|
||||
USING ivfflat (embedding vector_cosine_ops)
|
||||
WITH (lists = 100);
|
||||
|
||||
-- 查詢索引
|
||||
CREATE INDEX idx_chunk_vectors_uuid ON chunk_vectors(uuid);
|
||||
CREATE INDEX idx_chunk_vectors_type ON chunk_vectors(chunk_type);
|
||||
```
|
||||
|
||||
#### 儲存範例
|
||||
|
||||
```rust
|
||||
pub async fn store_vector_to_postgres(db: &PostgresDb, point: &VectorPoint) -> Result<()> {
|
||||
sqlx::query!(
|
||||
r#"
|
||||
INSERT INTO chunk_vectors (
|
||||
vector_id, uuid, chunk_id, chunk_type, chunk_index,
|
||||
start_time, end_time, embedding, metadata
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
|
||||
ON CONFLICT (vector_id) DO UPDATE SET
|
||||
embedding = EXCLUDED.embedding,
|
||||
metadata = EXCLUDED.metadata
|
||||
"#,
|
||||
point.id,
|
||||
point.payload.uuid,
|
||||
point.payload.chunk_id,
|
||||
point.payload.chunk_type,
|
||||
point.payload.chunk_index as i32,
|
||||
point.payload.start_time,
|
||||
point.payload.end_time,
|
||||
point.vector,
|
||||
serde_json::to_value(&point.payload.metadata)?,
|
||||
)
|
||||
.execute(&db.pool)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 12. 查詢範例
|
||||
|
||||
### 12.1 語義搜尋 (Semantic Search)
|
||||
|
||||
#### 查詢類型 1: 相似文字搜尋
|
||||
|
||||
```rust
|
||||
// 搜尋與問句相似的 chunks
|
||||
pub async fn semantic_search(
|
||||
qdrant: &QdrantDb,
|
||||
query: &str,
|
||||
limit: usize,
|
||||
) -> Result<Vec<SearchResult>> {
|
||||
// 1. 將問句向量化
|
||||
let query_vector = embed_text(query).await?;
|
||||
|
||||
// 2. 搜尋 Qdrant
|
||||
let results = qdrant.search(
|
||||
"chunks",
|
||||
&query_vector,
|
||||
limit,
|
||||
Some(&Filter::must([
|
||||
Condition::Match("chunk_type", "sentence"),
|
||||
])),
|
||||
).await?;
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
// 使用範例
|
||||
let results = semantic_search(&qdrant, "找出有人在說話的片段", 10).await?;
|
||||
for r in results {
|
||||
println!("{}: {:.3}", r.payload.chunk_id, r.score);
|
||||
println!(" Time: {}s - {}s", r.payload.start_time, r.payload.end_time);
|
||||
println!(" Text: {:?}", r.payload.text);
|
||||
}
|
||||
```
|
||||
|
||||
#### 查詢類型 2: 語音/文字混合搜尋
|
||||
|
||||
```sql
|
||||
-- PostgreSQL: 搜尋特定文字的 chunks
|
||||
SELECT
|
||||
c.chunk_id,
|
||||
c.chunk_type,
|
||||
c.start_time,
|
||||
c.end_time,
|
||||
c.content->>'text' as text,
|
||||
v.embedding <=> query_embedding('找出開車的場景') as distance
|
||||
FROM chunks c
|
||||
LEFT JOIN chunk_vectors v ON c.uuid = v.uuid AND c.chunk_id = v.chunk_id
|
||||
WHERE c.chunk_type = 'sentence'
|
||||
AND c.content->>'text' ILIKE '%car%'
|
||||
ORDER BY v.embedding <=> query_embedding('找出開車的場景')
|
||||
LIMIT 10;
|
||||
```
|
||||
|
||||
### 12.2 時間範圍搜尋
|
||||
|
||||
#### 查詢類型 3: 特定時間範圍
|
||||
|
||||
```rust
|
||||
// 找出 30-60 秒之間的所有 chunks
|
||||
pub async fn search_by_time_range(
|
||||
db: &PostgresDb,
|
||||
uuid: &str,
|
||||
start: f64,
|
||||
end: f64,
|
||||
) -> Result<Vec<Chunk>> {
|
||||
let chunks = sqlx::query_as!(
|
||||
Chunk,
|
||||
r#"
|
||||
SELECT * FROM chunks
|
||||
WHERE uuid = $1
|
||||
AND start_time < $3
|
||||
AND end_time > $2
|
||||
ORDER BY chunk_type, chunk_index
|
||||
"#,
|
||||
uuid, start, end
|
||||
)
|
||||
.fetch_all(&db.pool)
|
||||
.await?;
|
||||
Ok(chunks)
|
||||
}
|
||||
|
||||
// 使用範例
|
||||
let chunks = search_by_time_range(&db, "1636719dc31f78ac", 30.0, 60.0).await?;
|
||||
```
|
||||
|
||||
```javascript
|
||||
// MongoDB: 時間範圍查詢
|
||||
db.chunks.find({
|
||||
uuid: "1636719dc31f78ac",
|
||||
start_time: { $lt: 60 },
|
||||
end_time: { $gt: 30 }
|
||||
}).sort({ chunk_type: 1, chunk_index: 1 })
|
||||
```
|
||||
|
||||
### 12.3 混合搜尋 (Hybrid Search)
|
||||
|
||||
#### 查詢類型 4: 文字關鍵詞 + 向量相似度
|
||||
|
||||
```rust
|
||||
// 結合關鍵詞匹配與向量相似度
|
||||
pub async fn hybrid_search(
|
||||
db: &PostgresDb,
|
||||
qdrant: &QdrantDb,
|
||||
query: &str,
|
||||
keywords: &[&str],
|
||||
limit: usize,
|
||||
) -> Result<Vec<HybridResult>> {
|
||||
// 1. 向量搜尋
|
||||
let query_vector = embed_text(query).await?;
|
||||
let vector_results = qdrant.search("chunks", &query_vector, limit * 2, None).await?;
|
||||
|
||||
// 2. 關鍵詞過濾
|
||||
let keyword_filter: Vec<_> = keywords.iter()
|
||||
.map(|k| k.to_string())
|
||||
.collect();
|
||||
|
||||
let filtered: Vec<_> = vector_results.into_iter()
|
||||
.filter(|r| {
|
||||
if let Some(text) = &r.payload.text {
|
||||
keyword_filter.iter().any(|k| text.contains(k.as_str()))
|
||||
} else {
|
||||
false
|
||||
}
|
||||
})
|
||||
.take(limit)
|
||||
.collect();
|
||||
|
||||
Ok(filtered)
|
||||
}
|
||||
```
|
||||
|
||||
### 12.4 場景搜尋
|
||||
|
||||
#### 查詢類型 5: 找出特定場景
|
||||
|
||||
```sql
|
||||
-- PostgreSQL: 找出特定場景 ID 的 chunks
|
||||
SELECT * FROM chunks
|
||||
WHERE uuid = '1636719dc31f78ac'
|
||||
AND chunk_type = 'cut'
|
||||
AND (content->>'scene_id')::int = 5;
|
||||
|
||||
-- 找出包含轉場效果的 chunks
|
||||
SELECT * FROM chunks
|
||||
WHERE uuid = '1636719dc31f78ac'
|
||||
AND chunk_type = 'cut'
|
||||
AND content->>'transition_type' = 'dissolve';
|
||||
```
|
||||
|
||||
### 12.5 影片摘要
|
||||
|
||||
#### 查詢類型 6: 產生影片摘要
|
||||
|
||||
```sql
|
||||
-- 合併影片所有語句
|
||||
SELECT
|
||||
string_agg(content->>'text', ' ' ORDER BY start_time) as full_transcript
|
||||
FROM chunks
|
||||
WHERE uuid = '1636719dc31f78ac'
|
||||
AND chunk_type = 'sentence'
|
||||
AND content->>'text' IS NOT NULL;
|
||||
|
||||
-- 按場景聚合文字
|
||||
SELECT
|
||||
content->>'scene_id' as scene,
|
||||
string_agg(content->>'text', ' ' ORDER BY start_time) as scene_text
|
||||
FROM chunks
|
||||
WHERE uuid = '1636719dc31f78ac'
|
||||
AND chunk_type = 'cut'
|
||||
GROUP BY content->>'scene_id'
|
||||
ORDER BY MIN(start_time);
|
||||
```
|
||||
|
||||
### 12.6 常見查詢模式
|
||||
|
||||
| 查詢類型 | 描述 | 資料庫 | SQL/程式碼 |
|
||||
|----------|------|--------|-------------|
|
||||
| 語義搜尋 | 找相似內容 | Qdrant | `search(vector, limit)` |
|
||||
| 關鍵詞搜尋 | 精確文字匹配 | PostgreSQL | `ILIKE '%keyword%'` |
|
||||
| 時間範圍 | 特定時段 | Both | `start_time < end AND end_time > start` |
|
||||
| 場景搜尋 | 特定鏡頭 | PostgreSQL | `scene_id = N` |
|
||||
| 混合搜尋 | 向量+關鍵詞 | Both | 結合語義搜尋與關鍵詞搜尋 |
|
||||
| 摘要產生 | 合併文字 | PostgreSQL | `string_agg()` |
|
||||
|
||||
---
|
||||
|
||||
## 13. 資料庫選擇建議
|
||||
|
||||
### 13.1 儲存策略
|
||||
|
||||
| 資料類型 | 主要儲存 | 備份/查詢 | 說明 |
|
||||
|----------|----------|-----------|------|
|
||||
| **Chunk 元數據** | PostgreSQL | MongoDB | 結構化查詢為主 |
|
||||
| **向量資料** | Qdrant | PostgreSQL | 向量搜尋為主 |
|
||||
| **全文檢索** | PostgreSQL | - | 關鍵詞搜尋 |
|
||||
| **日誌/歷史** | MongoDB | - | 靈活性為主 |
|
||||
|
||||
### 13.2 讀寫模式
|
||||
|
||||
| 場景 | 寫入 | 讀取 |
|
||||
|------|------|------|
|
||||
| **影片處理** | PostgreSQL + Qdrant | - |
|
||||
| **語義搜尋** | - | Qdrant |
|
||||
| **時間軸瀏覽** | - | PostgreSQL |
|
||||
| **系統分析** | MongoDB | MongoDB |
|
||||
|
||||
---
|
||||
|
||||
## 14. 相關文件
|
||||
|
||||
- [JSON_OUTPUT_SPEC.md](./JSON_OUTPUT_SPEC.md) - JSON 輸出規範
|
||||
- [RUST_DEVELOPMENT.md](./RUST_DEVELOPMENT.md) - Rust 開發規範
|
||||
|
||||
Reference in New Issue
Block a user