perf: parallelize update task

I-Info · I-Info · commit ac3a73cab915 · 2025-02-21T18:09:46.000+08:00
diff --git a/packages/service/common/vectorStore/pg/index.ts b/packages/service/common/vectorStore/pg/index.ts
@@ -169,13 +169,13 @@ class PgClass {
     const pg = await connectPg();
     return pg.query<{ id: string }>(sql);
   }
-  async query<T extends QueryResultRow = any>(sql: string) {
+  async query<T extends QueryResultRow = any>(sql: string, warning = true) {
     const pg = await connectPg();
     const start = Date.now();
     return pg.query<T>(sql).then((res) => {
       const time = Date.now() - start;
 
-      if (time > 300) {
+      if (warning && time > 300) {
         addLog.warn(`pg query time: ${time}ms, sql: ${sql}`);
       }
 
diff --git a/projects/app/src/pages/api/admin/inithalfvec.ts b/projects/app/src/pages/api/admin/inithalfvec.ts
@@ -26,37 +26,99 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
       console.log('halfvector column added');
     }
 
-    let rowsUpdated;
-    let retryCount = 0;
-    do {
-      try {
-        rowsUpdated = await PgClient.query(`
-            WITH updated AS (
-              UPDATE ${DatasetVectorTableName}
-              SET halfvector = vector::halfvec(1536)
-              WHERE id IN (
-                  SELECT id
-                  FROM ${DatasetVectorTableName}
-                  WHERE halfvector IS NULL
-                  LIMIT 1000
-              )
-              RETURNING 1
-            )
-            SELECT count(*) FROM updated;
-          `);
-        console.log('rowsUpdated:', rowsUpdated.rows[0].count);
-      } catch (error) {
-        console.error('Error updating halfvector:', error);
-        retryCount++;
+    const maxIdResult = await PgClient.query(
+      `SELECT MAX(id) as max_id FROM ${DatasetVectorTableName}`
+    );
+    const maxId: number = maxIdResult.rows[0].max_id;
+
+    if (!maxId) {
+      console.warn('No data in the table: empty max_id');
+      jsonRes(res, { code: 500, error: 'No data in the table: empty max_id' });
+      return;
+    }
+
+    const batchSize = 25; // ids per UPDATE; batch i covers the half-open range [i * 25, (i + 1) * 25)
+    const numBatches = Math.floor(Number(maxId) / batchSize) + 1; // +1 so the last range includes maxId
+
+    const tasks: (() => Promise<void>)[] = [];
+    let totalRowsUpdated = 0;
+    let lastLoggedTime = Date.now();
+    let lastLoggedRows = 0;
+
+    const logUpdateSpeed = () => {
+      const currentTime = Date.now();
+      const timeElapsed = (currentTime - lastLoggedTime) / 1000; // seconds
+      const rowsUpdated = totalRowsUpdated - lastLoggedRows;
+      const speed = rowsUpdated / timeElapsed; // rows per second
+      console.log(`Update speed: ${speed.toFixed(2)} rows/s`);
+      lastLoggedTime = currentTime;
+      lastLoggedRows = totalRowsUpdated;
+    };
+
+    for (let i = 0; i < numBatches; i++) {
+      const startId = i * batchSize;
+      const endId = startId + batchSize;
+
+      const asyncUpdate = async () => { // converts one id range, retrying up to 3 times
+        let retryCount = 0;
+        do {
+          try {
+            const rowsUpdated = await PgClient.query(
+              `
+                UPDATE ${DatasetVectorTableName}
+                SET halfvector = vector::halfvec(1536)
+                WHERE id >= ${startId} AND id < ${endId} AND halfvector IS NULL;
+            `,
+              false
+            );
+            if (rowsUpdated?.rowCount) {
+              totalRowsUpdated += rowsUpdated.rowCount;
+              console.log(`Batch ${i + 1} - rowsUpdated: ${rowsUpdated.rowCount}`);
+            }
+            break; // success: leave the retry loop with retryCount < 3
+          } catch (error) {
+            console.error(`Error updating halfvector in batch ${i + 1}:`, error);
+            retryCount++;
+          }
+        } while (retryCount < 3);
+        // Reaching 3 means every attempt failed; throw so the awaiting caller sees the failure.
+        if (retryCount >= 3) {
+          console.error(`Failed to update halfvector in batch ${i + 1} after 3 retries`);
+          throw new Error('Failed to update halfvector in batch');
+        }
+      };
+
+      tasks.push(asyncUpdate);
+    }
+
+    // randomize task list
+    tasks.sort(() => Math.random() - 0.5);
+
+    let currentIdx = 0;
+    const executor = async () => {
+      console.log(`Executing tasks from: ${currentIdx}`);
+      let idx: number;
+      while ((idx = currentIdx++) < tasks.length) {
+        try {
+          await tasks[idx]();
+        } catch (error) {
+          console.error(`Error updating halfvector in task ${idx}`, error);
+        }
       }
-    } while (retryCount < 3 && rowsUpdated!.rows[0].count > 0);
-
-    if (retryCount >= 3) {
-      console.error('Failed to update halfvector after 3 retries');
-      return jsonRes(res, {
-        code: 500,
-        error: 'Failed to update halfvector after 3 retries'
-      });
+    };
+
+    const maxConcurrency = 20;
+    const promises = [];
+    for (let i = 0; i < maxConcurrency; ++i) {
+      promises.push(executor());
+    }
+
+    const telemetryInterval = setInterval(logUpdateSpeed, 5000);
+
+    try {
+      await Promise.all(promises);
+    } finally {
+      clearInterval(telemetryInterval);
     }
 
     console.log('halfvector column updated');
@@ -70,6 +132,13 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
       COMMIT;
       `
     );
+    console.log('halfvector column set not null');
+
+    // 创建索引以提升查询性能
+    await PgClient.query(`
+      CREATE INDEX CONCURRENTLY IF NOT EXISTS halfvector_index ON ${DatasetVectorTableName} USING hnsw (halfvector halfvec_ip_ops) WITH (m = 32, ef_construction = 128);
+    `);
+    console.log('halfvector index created');
 
     // 后台释放空间，避免使用 VACUUM FULL 导致锁表。
     await PgClient.query(`VACUUM ${DatasetVectorTableName};`);

Original file line number	Diff line number	Diff line change
`@@ -169,13 +169,13 @@ class PgClass {`
`169`	`169`	`const pg = await connectPg();`
`170`	`170`	`return pg.query<{ id: string }>(sql);`
`171`	`171`	`}`
`172`		`- async query<T extends QueryResultRow = any>(sql: string) {`
	`172`	`+ async query<T extends QueryResultRow = any>(sql: string, warning = true) {`
`173`	`173`	`const pg = await connectPg();`
`174`	`174`	`const start = Date.now();`
`175`	`175`	`return pg.query<T>(sql).then((res) => {`
`176`	`176`	`const time = Date.now() - start;`
`177`	`177`
`178`		`- if (time > 300) {`
	`178`	`+ if (warning && time > 300) {`
`179`	`179`	addLog.warn(`pg query time: ${time}ms, sql: ${sql}`);
`180`	`180`	`}`
`181`	`181`