feat: add Vision LLM integration (CLIP + Qwen3-VL cascade)

- Add Qwen3-VL dynamic management (start/stop/status CLI)
- Add CLIP + Qwen3-VL cascade detection strategy
- Add Vision CLI commands (vision start/stop/status, detect)
- Add cascade_vision processor module
- Add clip processor module
- Add qwen_vl_manager module

Changes:
- scripts/start_qwen3vl.sh, stop_qwen3vl.sh: Qwen3-VL management scripts
- src/core/vision/: Qwen3-VL manager module
- src/core/processor/cascade_vision.rs: CLIP + Qwen3-VL cascade logic
- src/core/processor/clip.rs: CLIP classification and detection
- src/api/clip_api.rs: CLIP API endpoints
- src/cli/vision.rs: Vision CLI implementation
- src/cli/args.rs: Add Vision and Detect commands
- src/main.rs: Integrate Vision CLI
- src/core/mod.rs: Add vision module
- src/core/processor/mod.rs: Add cascade_vision module
This commit is contained in:
Accusys
2026-06-13 16:25:52 +08:00
parent 834b0d4865
commit 17e4e15860
37 changed files with 2185 additions and 294 deletions

View File

@@ -63,6 +63,7 @@ pub fn bbox_routes() -> Router<crate::api::types::AppState> {
)
.route("/api/v1/file/:file_uuid/video", get(stream_video))
.route("/api/v1/file/:file_uuid/thumbnail", get(face_thumbnail))
.route("/api/v1/file/:file_uuid/chunk/:chunk_id/thumbnail", get(chunk_thumbnail))
.route("/api/v1/file/:file_uuid/clip", get(video_clip))
}
@@ -745,13 +746,14 @@ async fn face_thumbnail(
.join(format!("{}.jpg", frame));
if cached_path.exists() {
tracing::debug!("[thumbnail] Using cached face crop: {}", cached_path.display());
let bytes = tokio::fs::read(&cached_path)
.await
.map_err(|e| {
tracing::warn!("[thumbnail] Failed to read cached file: {}", e);
StatusCode::INTERNAL_SERVER_ERROR
})?;
tracing::debug!(
"[thumbnail] Using cached face crop: {}",
cached_path.display()
);
let bytes = tokio::fs::read(&cached_path).await.map_err(|e| {
tracing::warn!("[thumbnail] Failed to read cached file: {}", e);
StatusCode::INTERNAL_SERVER_ERROR
})?;
// Validate cached JPEG
crate::core::thumbnail::validator::validate_jpeg(&bytes).map_err(|e| {
@@ -766,7 +768,7 @@ async fn face_thumbnail(
.body(Body::from(bytes))
.unwrap());
}
// Cached file not found, fallback to ffmpeg
tracing::debug!("[thumbnail] Cached file not found, falling back to ffmpeg");
}
@@ -841,6 +843,99 @@ async fn face_thumbnail(
.unwrap())
}
/// Returns `true` when `value` is exactly one ordinary path component.
///
/// Empty strings, `.`, `..`, absolute paths and anything containing a path
/// separator are rejected, so an accepted value can be joined onto a base
/// directory without leaving it.
fn is_plain_path_component(value: &str) -> bool {
    let mut components = std::path::Path::new(value).components();
    // The equality guard also rejects inputs such as "a/" or "a/.", which
    // `components()` would otherwise normalise down to a single segment.
    matches!(
        (components.next(), components.next()),
        (Some(std::path::Component::Normal(part)), None)
            if part == std::ffi::OsStr::new(value)
    )
}

/// Serve a JPEG thumbnail for one chunk of a video.
///
/// The image is taken from the frame halfway between the chunk's start and
/// end, extracted with ffmpeg, validated, and cached on disk under
/// `OUTPUT_DIR/.chunk_thumbs/<file_uuid>/<chunk_id>.jpg`. Later requests are
/// answered straight from that cache file.
///
/// # Errors
/// - `400 Bad Request` if `file_uuid` or `chunk_id` is not a single plain path
///   component (they are used to build the cache path).
/// - `404 Not Found` if the chunk row or the video row does not exist.
/// - `500 Internal Server Error` on database, cache-read, ffmpeg or JPEG
///   validation failures.
async fn chunk_thumbnail(
    State(state): State<crate::api::types::AppState>,
    Path((file_uuid, chunk_id)): Path<(String, String)>,
) -> Result<impl IntoResponse, StatusCode> {
    // Both identifiers are request-controlled and become segments of the cache
    // path, which is read *before* any database lookup. Refuse anything that
    // could escape the cache directory (e.g. "..", "/etc", "a/b").
    if !is_plain_path_component(&file_uuid) || !is_plain_path_component(&chunk_id) {
        tracing::warn!("[chunk_thumbnail] Rejected unsafe file_uuid/chunk_id");
        return Err(StatusCode::BAD_REQUEST);
    }

    let videos_table = schema::table_name("videos");
    let chunk_table = schema::table_name("chunk");
    let output_dir = crate::core::config::OUTPUT_DIR.as_str();
    let cached_path = std::path::PathBuf::from(output_dir)
        .join(".chunk_thumbs")
        .join(&file_uuid)
        .join(format!("{}.jpg", chunk_id));

    // Fast path: a previously generated thumbnail exists on disk.
    if cached_path.exists() {
        let bytes = tokio::fs::read(&cached_path).await.map_err(|e| {
            tracing::warn!("[chunk_thumbnail] Failed to read cache: {}", e);
            StatusCode::INTERNAL_SERVER_ERROR
        })?;
        return Ok(Response::builder()
            .status(StatusCode::OK)
            .header(header::CONTENT_TYPE, "image/jpeg")
            .header(header::CACHE_CONTROL, "public, max-age=86400")
            .body(Body::from(bytes))
            .unwrap());
    }

    // Look up the chunk's time span and frame rate.
    let row: (f64, f64, f64) = sqlx::query_as(&format!(
        "SELECT start_time, end_time, fps FROM {} WHERE file_uuid = $1 AND chunk_id = $2 LIMIT 1",
        chunk_table
    ))
    .bind(&file_uuid)
    .bind(&chunk_id)
    .fetch_optional(state.db.pool())
    .await
    .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?
    .ok_or(StatusCode::NOT_FOUND)?;
    let (start_time, end_time, fps) = row;

    // Convert the time span to frame numbers and pick the midpoint.
    let start_frame = (start_time * fps).round() as i64;
    let end_frame = (end_time * fps).round() as i64;
    let mid_frame = (start_frame + end_frame) / 2;

    let video: Option<(String, Option<i64>)> = sqlx::query_as(&format!(
        "SELECT file_path, total_frames FROM {} WHERE file_uuid = $1",
        videos_table
    ))
    .bind(&file_uuid)
    .fetch_optional(state.db.pool())
    .await
    .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
    let (file_path, total_frames) = video.ok_or(StatusCode::NOT_FOUND)?;

    // Clamp to the last valid frame when the frame count is known; never go
    // below frame 0.
    let frame = match total_frames {
        Some(t) if t > 0 => mid_frame.min(t - 1).max(0),
        _ => mid_frame.max(0),
    };

    // ffmpeg `select` filter expression that keeps only frame number `frame`.
    let select = format!("select=eq(n\\,{})", frame);
    // NOTE(review): `.output()` is not awaited, so this appears to be a
    // synchronous process call that blocks the async worker while ffmpeg
    // runs — consider `tokio::task::spawn_blocking`; confirm `ffmpeg_cmd`'s type.
    let output = ffmpeg_cmd()
        .args([
            "-i", &file_path,
            "-vf", &select,
            "-frames:v", "1",
            "-f", "image2pipe",
            "-vcodec", "mjpeg",
            "-",
        ])
        .output()
        .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
    if !output.status.success() {
        return Err(StatusCode::INTERNAL_SERVER_ERROR);
    }

    crate::core::thumbnail::validator::validate_jpeg(&output.stdout).map_err(|e| {
        tracing::warn!("[chunk_thumbnail] JPEG validation failed: {}", e);
        StatusCode::INTERNAL_SERVER_ERROR
    })?;

    // Caching is best-effort: a failure is logged but never fails the request.
    if let Some(parent) = cached_path.parent() {
        if let Err(e) = tokio::fs::create_dir_all(parent).await {
            tracing::warn!("[chunk_thumbnail] Failed to create cache dir: {}", e);
        }
    }
    if let Err(e) = tokio::fs::write(&cached_path, &output.stdout).await {
        tracing::warn!("[chunk_thumbnail] Failed to write cache: {}", e);
    }

    Ok(Response::builder()
        .status(StatusCode::OK)
        .header(header::CONTENT_TYPE, "image/jpeg")
        .header(header::CACHE_CONTROL, "public, max-age=86400")
        .body(Body::from(output.stdout))
        .unwrap())
}
#[derive(Debug, serde::Deserialize)]
struct ClipQuery {
start_frame: Option<i64>,
@@ -945,13 +1040,17 @@ async fn stranger_video_inner(
use axum::http::header;
use uuid::Uuid;
tracing::info!("[stranger_video] Starting for file={}, stranger={}", file_uuid, stranger_id);
tracing::info!(
"[stranger_video] Starting for file={}, stranger={}",
file_uuid,
stranger_id
);
let (mode, audio) = parse_video_params(&params);
let videos_table = schema::table_name("videos");
tracing::debug!("[stranger_video] videos_table: {}", videos_table);
let row: Option<(String, f64, i32, i32)> = sqlx::query_as(&format!(
"SELECT file_path, COALESCE(fps, 24.0), COALESCE(width, 0), COALESCE(height, 0) FROM {} WHERE file_uuid = $1",
videos_table
@@ -963,18 +1062,22 @@ async fn stranger_video_inner(
tracing::error!("[stranger_video] Video query error: {}", e);
StatusCode::INTERNAL_SERVER_ERROR
})?;
let (video_path, fps, _width, _height) = row.ok_or_else(|| {
tracing::error!("[stranger_video] Video not found for uuid={}", file_uuid);
StatusCode::NOT_FOUND
})?;
tracing::info!("[stranger_video] Found video: path={}, fps={}", video_path, fps);
tracing::info!(
"[stranger_video] Found video: path={}, fps={}",
video_path,
fps
);
// Query face detections by stranger_id directly
let face_table = schema::table_name("face_detections");
tracing::debug!("[stranger_video] face_table: {}", face_table);
// frame_number is BIGINT (i64) in database
let rows: Vec<(i64, i32, i32, i32, i32)> = sqlx::query_as(&format!(
"SELECT frame_number, x, y, width, height FROM {} WHERE file_uuid = $1 AND stranger_id = $2 ORDER BY frame_number",
@@ -982,15 +1085,18 @@ async fn stranger_video_inner(
))
.bind(&file_uuid).bind(stranger_id)
.fetch_all(state.db.pool()).await
.unwrap_or_else(|e| {
tracing::error!("[stranger_video] Face query error: {}", e);
vec![]
.unwrap_or_else(|e| {
tracing::error!("[stranger_video] Face query error: {}", e);
vec![]
});
tracing::info!("[stranger_video] Found {} faces", rows.len());
if rows.is_empty() {
tracing::error!("[stranger_video] No faces found for stranger_id={}", stranger_id);
tracing::error!(
"[stranger_video] No faces found for stranger_id={}",
stranger_id
);
return Err(StatusCode::NOT_FOUND);
}
@@ -1004,8 +1110,13 @@ async fn stranger_video_inner(
let duration = (last_frame - first_frame) as f64 / fps + padding * 2.0;
let seek = (start_sec - padding).max(0.0);
tracing::info!("[stranger_video] Frame range: {} - {}, time: {:.2}s - {:.2}s",
first_frame, last_frame, seek, seek + duration);
tracing::info!(
"[stranger_video] Frame range: {} - {}, time: {:.2}s - {:.2}s",
first_frame,
last_frame,
seek,
seek + duration
);
// Only support normal mode for stranger video
let tmp = std::env::temp_dir().join(format!("stranger_{}.mp4", Uuid::new_v4()));
@@ -1017,37 +1128,98 @@ async fn stranger_video_inner(
cmd_args.push("-an");
}
cmd_args.extend_from_slice(&["-y", &tmp_str]);
tracing::debug!("[stranger_video] ffmpeg args: {:?}", cmd_args);
let result = ffmpeg_cmd()
.args(&cmd_args)
.output()
.map_err(|e| {
tracing::error!("[stranger_video] ffmpeg spawn error: {}", e);
StatusCode::INTERNAL_SERVER_ERROR
})?;
let result = ffmpeg_cmd().args(&cmd_args).output().map_err(|e| {
tracing::error!("[stranger_video] ffmpeg spawn error: {}", e);
StatusCode::INTERNAL_SERVER_ERROR
})?;
if !result.status.success() {
tracing::error!("[stranger_video] ffmpeg failed: {}", String::from_utf8_lossy(&result.stderr));
tracing::error!(
"[stranger_video] ffmpeg failed: {}",
String::from_utf8_lossy(&result.stderr)
);
return Err(StatusCode::INTERNAL_SERVER_ERROR);
}
tracing::info!("[stranger_video] ffmpeg success, output size: {} bytes", result.stdout.len());
let data = tokio::fs::read(&tmp)
.await
.map_err(|e| {
tracing::error!("[stranger_video] Read output error: {}", e);
StatusCode::INTERNAL_SERVER_ERROR
})?;
tracing::info!(
"[stranger_video] ffmpeg success, output size: {} bytes",
result.stdout.len()
);
let data = tokio::fs::read(&tmp).await.map_err(|e| {
tracing::error!("[stranger_video] Read output error: {}", e);
StatusCode::INTERNAL_SERVER_ERROR
})?;
let _ = std::fs::remove_file(&tmp);
tracing::info!("[stranger_video] Returning video, size: {} bytes", data.len());
tracing::info!(
"[stranger_video] Returning video, size: {} bytes",
data.len()
);
Ok(Response::builder()
.header(header::CONTENT_TYPE, "video/mp4")
.header(header::CONTENT_LENGTH, data.len())
.body(Body::from(data))
.unwrap())
}
// ── Media Proxy: Unified endpoint for WordPress frontend ──
// Accepts the same query param format as the (inactive) WordPress snippet 61.
// Dispatches to the appropriate existing handler based on `type`.
// Caddy rewrites /wp-json/momentry/v1/media → /api/v1/media-proxy{?}
/// Forward a media-proxy request to an existing handler chosen by the `type`
/// query parameter.
///
/// The file identifier is read from `uuid`, falling back to `file_uuid`.
/// Supported `type` values are `thumbnail`, `video` and `chunk_thumbnail`
/// (the latter also requires `chunk_id`). A missing identifier, a missing or
/// unknown `type`, or a missing `chunk_id` yields `400 Bad Request`; any other
/// status comes from the handler the request was forwarded to.
async fn media_proxy_handler(
    State(state): State<crate::api::types::AppState>,
    Query(params): Query<std::collections::HashMap<String, String>>,
    request: axum::http::Request<Body>,
) -> Result<Response, StatusCode> {
    /// Look up `key` and parse its value; absent or unparsable values give `None`.
    fn parsed<T: std::str::FromStr>(
        map: &std::collections::HashMap<String, String>,
        key: &str,
    ) -> Option<T> {
        map.get(key).and_then(|raw| raw.parse().ok())
    }

    // Take owned copies up front so `params` can be moved into a handler later.
    let file_uuid = match params.get("uuid").or_else(|| params.get("file_uuid")) {
        Some(value) => value.clone(),
        None => return Err(StatusCode::BAD_REQUEST),
    };
    let kind = match params.get("type") {
        Some(value) => value.clone(),
        None => return Err(StatusCode::BAD_REQUEST),
    };

    if kind == "thumbnail" {
        let thumb_query = ThumbQuery {
            frame: parsed(&params, "frame"),
            x: parsed(&params, "x"),
            y: parsed(&params, "y"),
            w: parsed(&params, "w"),
            h: parsed(&params, "h"),
            trace_id: parsed(&params, "trace_id"),
        };
        let outcome = face_thumbnail(State(state), Path(file_uuid), Query(thumb_query)).await;
        return outcome.map(IntoResponse::into_response);
    }

    if kind == "video" {
        let outcome = stream_video(State(state), Path(file_uuid), Query(params), request).await;
        return outcome.map(IntoResponse::into_response);
    }

    if kind == "chunk_thumbnail" {
        let chunk_id = match params.get("chunk_id") {
            Some(value) => value.clone(),
            None => return Err(StatusCode::BAD_REQUEST),
        };
        let outcome = chunk_thumbnail(State(state), Path((file_uuid, chunk_id))).await;
        return outcome.map(IntoResponse::into_response);
    }

    // Any other `type` value is not supported.
    Err(StatusCode::BAD_REQUEST)
}
/// Build the router that exposes the unified media-proxy endpoint
/// (`GET /api/v1/media-proxy`).
pub fn media_proxy_routes() -> Router<crate::api::types::AppState> {
    let proxy_endpoint = get(media_proxy_handler);
    Router::new().route("/api/v1/media-proxy", proxy_endpoint)
}