feat: backup architecture docs, source code, and scripts

2026-04-25 17:15:45 +08:00
parent 59809dae1f
commit 1f84e5469f
368 changed files with 146329 additions and 261 deletions
--- a/src/bin/migrate_chinese_text.rs
+++ b/src/bin/migrate_chinese_text.rs
@@ -0,0 +1,92 @@
+// Migration script to tokenize existing Chinese text in the database
+// Usage: cargo run --bin migrate_chinese_text
+
+use dotenv;
+use momentry_core::core::text::tokenizer::tokenize_chinese_text;
+use sqlx::{postgres::PgPoolOptions, Row};
+use std::env;
+
+/// Tokenizes `text_content` for every chunk in the database that contains Chinese text.
+///
+/// Connects via `DATABASE_URL` (with a local fallback), tokenizes each matching chunk's
+/// source text with `tokenize_chinese_text`, and rewrites `text_content` when it differs.
+///
+/// # Errors
+///
+/// Returns the first connection, query, column-decoding, or update error. Updates are not
+/// wrapped in a transaction, so rows written before a failure stay written.
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    dotenv::dotenv().ok(); // a missing .env file is not an error
+
+    let database_url = env::var("DATABASE_URL")
+        .unwrap_or_else(|_| "postgres://accusys@localhost:5432/momentry".to_string());
+
+    println!("Connecting to database...");
+
+    let pool = PgPoolOptions::new()
+        .max_connections(5)
+        .connect(&database_url)
+        .await?;
+
+    println!("Fetching Chinese chunks from database...");
+
+    // Raw query (not the sqlx macros) to avoid sqlx macro issues.
+    let query = r#"
+        SELECT id, text_content, content->'data'->>'text' as chinese_text, content->>'text' as english_text
+        FROM chunks 
+        WHERE text_content ~ '[\u4e00-\u9fff]'
+        ORDER BY id
+    "#;
+
+    let rows = sqlx::query(query).fetch_all(&pool).await?;
+
+    println!("Found {} Chinese chunks to process", rows.len());
+
+    let mut updated_count = 0;
+
+    for row in &rows {
+        // `try_get` turns a column type mismatch into an error instead of a panic.
+        let id: i32 = row.try_get("id")?;
+        let text_content: Option<String> = row.try_get("text_content")?;
+        let chinese_text: Option<String> = row.try_get("chinese_text")?;
+        let english_text: Option<String> = row.try_get("english_text")?;
+        let current_text = text_content.as_deref().unwrap_or("");
+
+        // Source priority: content->'data'->>'text', then content->>'text', then the stored
+        // text. Blank candidates are skipped so an empty JSON field cannot erase the text.
+        let original_text = chinese_text
+            .as_deref()
+            .filter(|s| !s.trim().is_empty())
+            .or_else(|| english_text.as_deref().filter(|s| !s.trim().is_empty()))
+            .unwrap_or(current_text);
+
+        let tokenized_text = tokenize_chinese_text(original_text);
+
+        if current_text == tokenized_text {
+            println!("Skipping chunk {} - already tokenized", id);
+            continue;
+        }
+
+        println!("Updating chunk {}:", id);
+        println!("  Original: {}", original_text);
+        println!("  Tokenized: {}", tokenized_text);
+
+        sqlx::query("UPDATE chunks SET text_content = $1 WHERE id = $2")
+            .bind(&tokenized_text)
+            .bind(id)
+            .execute(&pool)
+            .await?;
+
+        updated_count += 1;
+    }
+
+    println!("\nMigration completed!");
+    println!(
+        "Updated {} out of {} Chinese chunks",
+        updated_count,
+        rows.len()
+    );
+
+    Ok(())
+}