// Migration script to tokenize existing Chinese text in the database // Usage: cargo run --bin migrate_chinese_text use dotenv; use momentry_core::core::text::tokenizer::tokenize_chinese_text; use sqlx::{postgres::PgPoolOptions, Row}; use std::env; #[tokio::main] async fn main() -> Result<(), Box> { // Load environment variables from .env file dotenv::dotenv().ok(); // Get database URL from environment let database_url = env::var("DATABASE_URL") .unwrap_or_else(|_| "postgres://accusys@localhost:5432/momentry".to_string()); println!("Connecting to database..."); // Create connection pool let pool = PgPoolOptions::new() .max_connections(5) .connect(&database_url) .await?; println!("Fetching Chinese chunks from database..."); // Get all chunks with Chinese text using raw query to avoid sqlx macro issues let query = r#" SELECT id, text_content, content->'data'->>'text' as chinese_text, content->>'text' as english_text FROM chunks WHERE text_content ~ '[\u4e00-\u9fff]' ORDER BY id "#; let rows = sqlx::query(query).fetch_all(&pool).await?; println!("Found {} Chinese chunks to process", rows.len()); let mut updated_count = 0; for row in &rows { let id: i32 = row.get(0); let text_content: Option = row.get(1); let chinese_text: Option = row.get(2); let english_text: Option = row.get(3); // Clone text_content for later comparison let text_content_clone = text_content.clone(); // Determine the original text (prioritize chinese_text from content->'data'->>'text') let original_text = if let Some(ref chinese_text) = chinese_text { chinese_text.as_str() } else if let Some(ref english_text) = english_text { english_text.as_str() } else { text_content.as_deref().unwrap_or("") }; // Tokenize the text let tokenized_text = tokenize_chinese_text(original_text); // Check if tokenization changed the text let current_text = text_content_clone.unwrap_or_default(); if current_text == tokenized_text { println!("Skipping chunk {} - already tokenized", id); continue; } println!("Updating chunk {}:", id); println!(" Original: {}", original_text); println!(" Tokenized: {}", tokenized_text); // Update the chunk sqlx::query("UPDATE chunks SET text_content = $1 WHERE id = $2") .bind(&tokenized_text) .bind(id) .execute(&pool) .await?; updated_count += 1; } println!("\nMigration completed!"); println!( "Updated {} out of {} Chinese chunks", updated_count, rows.len() ); Ok(()) }