Files
momentry_core/src/bin/migrate_chinese_text.rs

93 lines
2.8 KiB
Rust

// Migration script to tokenize existing Chinese text in the database
// Usage: cargo run --bin migrate_chinese_text
use dotenv;
use momentry_core::core::text::tokenizer::tokenize_chinese_text;
use sqlx::{postgres::PgPoolOptions, Row};
use std::env;
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Load environment variables from .env file
dotenv::dotenv().ok();
// Get database URL from environment
let database_url = env::var("DATABASE_URL")
.unwrap_or_else(|_| "postgres://accusys@localhost:5432/momentry".to_string());
println!("Connecting to database...");
// Create connection pool
let pool = PgPoolOptions::new()
.max_connections(5)
.connect(&database_url)
.await?;
println!("Fetching Chinese chunks from database...");
// Get all chunks with Chinese text using raw query to avoid sqlx macro issues
let query = r#"
SELECT id, text_content, content->'data'->>'text' as chinese_text, content->>'text' as english_text
FROM chunks
WHERE text_content ~ '[\u4e00-\u9fff]'
ORDER BY id
"#;
let rows = sqlx::query(query).fetch_all(&pool).await?;
println!("Found {} Chinese chunks to process", rows.len());
let mut updated_count = 0;
for row in &rows {
let id: i32 = row.get(0);
let text_content: Option<String> = row.get(1);
let chinese_text: Option<String> = row.get(2);
let english_text: Option<String> = row.get(3);
// Clone text_content for later comparison
let text_content_clone = text_content.clone();
// Determine the original text (prioritize chinese_text from content->'data'->>'text')
let original_text = if let Some(ref chinese_text) = chinese_text {
chinese_text.as_str()
} else if let Some(ref english_text) = english_text {
english_text.as_str()
} else {
text_content.as_deref().unwrap_or("")
};
// Tokenize the text
let tokenized_text = tokenize_chinese_text(original_text);
// Check if tokenization changed the text
let current_text = text_content_clone.unwrap_or_default();
if current_text == tokenized_text {
println!("Skipping chunk {} - already tokenized", id);
continue;
}
println!("Updating chunk {}:", id);
println!(" Original: {}", original_text);
println!(" Tokenized: {}", tokenized_text);
// Update the chunk
sqlx::query("UPDATE chunks SET text_content = $1 WHERE id = $2")
.bind(&tokenized_text)
.bind(id)
.execute(&pool)
.await?;
updated_count += 1;
}
println!("\nMigration completed!");
println!(
"Updated {} out of {} Chinese chunks",
updated_count,
rows.len()
);
Ok(())
}