feat: backup architecture docs, source code, and scripts
This commit is contained in:
92
src/bin/migrate_chinese_text.rs
Normal file
92
src/bin/migrate_chinese_text.rs
Normal file
@@ -0,0 +1,92 @@
|
||||
// Migration script to tokenize existing Chinese text in the database
|
||||
// Usage: cargo run --bin migrate_chinese_text
|
||||
|
||||
use dotenv;
|
||||
use momentry_core::core::text::tokenizer::tokenize_chinese_text;
|
||||
use sqlx::{postgres::PgPoolOptions, Row};
|
||||
use std::env;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// Load environment variables from .env file
|
||||
dotenv::dotenv().ok();
|
||||
|
||||
// Get database URL from environment
|
||||
let database_url = env::var("DATABASE_URL")
|
||||
.unwrap_or_else(|_| "postgres://accusys@localhost:5432/momentry".to_string());
|
||||
|
||||
println!("Connecting to database...");
|
||||
|
||||
// Create connection pool
|
||||
let pool = PgPoolOptions::new()
|
||||
.max_connections(5)
|
||||
.connect(&database_url)
|
||||
.await?;
|
||||
|
||||
println!("Fetching Chinese chunks from database...");
|
||||
|
||||
// Get all chunks with Chinese text using raw query to avoid sqlx macro issues
|
||||
let query = r#"
|
||||
SELECT id, text_content, content->'data'->>'text' as chinese_text, content->>'text' as english_text
|
||||
FROM chunks
|
||||
WHERE text_content ~ '[\u4e00-\u9fff]'
|
||||
ORDER BY id
|
||||
"#;
|
||||
|
||||
let rows = sqlx::query(query).fetch_all(&pool).await?;
|
||||
|
||||
println!("Found {} Chinese chunks to process", rows.len());
|
||||
|
||||
let mut updated_count = 0;
|
||||
|
||||
for row in &rows {
|
||||
let id: i32 = row.get(0);
|
||||
let text_content: Option<String> = row.get(1);
|
||||
let chinese_text: Option<String> = row.get(2);
|
||||
let english_text: Option<String> = row.get(3);
|
||||
|
||||
// Clone text_content for later comparison
|
||||
let text_content_clone = text_content.clone();
|
||||
|
||||
// Determine the original text (prioritize chinese_text from content->'data'->>'text')
|
||||
let original_text = if let Some(ref chinese_text) = chinese_text {
|
||||
chinese_text.as_str()
|
||||
} else if let Some(ref english_text) = english_text {
|
||||
english_text.as_str()
|
||||
} else {
|
||||
text_content.as_deref().unwrap_or("")
|
||||
};
|
||||
|
||||
// Tokenize the text
|
||||
let tokenized_text = tokenize_chinese_text(original_text);
|
||||
|
||||
// Check if tokenization changed the text
|
||||
let current_text = text_content_clone.unwrap_or_default();
|
||||
if current_text == tokenized_text {
|
||||
println!("Skipping chunk {} - already tokenized", id);
|
||||
continue;
|
||||
}
|
||||
|
||||
println!("Updating chunk {}:", id);
|
||||
println!(" Original: {}", original_text);
|
||||
println!(" Tokenized: {}", tokenized_text);
|
||||
|
||||
// Update the chunk
|
||||
sqlx::query("UPDATE chunks SET text_content = $1 WHERE id = $2")
|
||||
.bind(&tokenized_text)
|
||||
.bind(id)
|
||||
.execute(&pool)
|
||||
.await?;
|
||||
|
||||
updated_count += 1;
|
||||
}
|
||||
|
||||
println!("\nMigration completed!");
|
||||
println!(
|
||||
"Updated {} out of {} Chinese chunks",
|
||||
updated_count,
|
||||
rows.len()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Reference in New Issue
Block a user