93 lines
2.8 KiB
Rust
93 lines
2.8 KiB
Rust
// Migration script that tokenizes existing Chinese text in the database.
//
// Usage: cargo run --bin migrate_chinese_text
|
|
|
|
use dotenv;
|
|
use momentry_core::core::text::tokenizer::tokenize_chinese_text;
|
|
use sqlx::{postgres::PgPoolOptions, Row};
|
|
use std::env;
|
|
|
|
#[tokio::main]
|
|
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
// Load environment variables from .env file
|
|
dotenv::dotenv().ok();
|
|
|
|
// Get database URL from environment
|
|
let database_url = env::var("DATABASE_URL")
|
|
.unwrap_or_else(|_| "postgres://accusys@localhost:5432/momentry".to_string());
|
|
|
|
println!("Connecting to database...");
|
|
|
|
// Create connection pool
|
|
let pool = PgPoolOptions::new()
|
|
.max_connections(5)
|
|
.connect(&database_url)
|
|
.await?;
|
|
|
|
println!("Fetching Chinese chunks from database...");
|
|
|
|
// Get all chunks with Chinese text using raw query to avoid sqlx macro issues
|
|
let query = r#"
|
|
SELECT id, text_content, content->'data'->>'text' as chinese_text, content->>'text' as english_text
|
|
FROM chunks
|
|
WHERE text_content ~ '[\u4e00-\u9fff]'
|
|
ORDER BY id
|
|
"#;
|
|
|
|
let rows = sqlx::query(query).fetch_all(&pool).await?;
|
|
|
|
println!("Found {} Chinese chunks to process", rows.len());
|
|
|
|
let mut updated_count = 0;
|
|
|
|
for row in &rows {
|
|
let id: i32 = row.get(0);
|
|
let text_content: Option<String> = row.get(1);
|
|
let chinese_text: Option<String> = row.get(2);
|
|
let english_text: Option<String> = row.get(3);
|
|
|
|
// Clone text_content for later comparison
|
|
let text_content_clone = text_content.clone();
|
|
|
|
// Determine the original text (prioritize chinese_text from content->'data'->>'text')
|
|
let original_text = if let Some(ref chinese_text) = chinese_text {
|
|
chinese_text.as_str()
|
|
} else if let Some(ref english_text) = english_text {
|
|
english_text.as_str()
|
|
} else {
|
|
text_content.as_deref().unwrap_or("")
|
|
};
|
|
|
|
// Tokenize the text
|
|
let tokenized_text = tokenize_chinese_text(original_text);
|
|
|
|
// Check if tokenization changed the text
|
|
let current_text = text_content_clone.unwrap_or_default();
|
|
if current_text == tokenized_text {
|
|
println!("Skipping chunk {} - already tokenized", id);
|
|
continue;
|
|
}
|
|
|
|
println!("Updating chunk {}:", id);
|
|
println!(" Original: {}", original_text);
|
|
println!(" Tokenized: {}", tokenized_text);
|
|
|
|
// Update the chunk
|
|
sqlx::query("UPDATE chunks SET text_content = $1 WHERE id = $2")
|
|
.bind(&tokenized_text)
|
|
.bind(id)
|
|
.execute(&pool)
|
|
.await?;
|
|
|
|
updated_count += 1;
|
|
}
|
|
|
|
println!("\nMigration completed!");
|
|
println!(
|
|
"Updated {} out of {} Chinese chunks",
|
|
updated_count,
|
|
rows.len()
|
|
);
|
|
|
|
Ok(())
|
|
}
|