Eastern-Research-Group · cschwinderg · Jan 13, 2025 · Jan 13, 2025 · Jan 13, 2025 · Jan 13, 2025
diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml
@@ -2,11 +2,11 @@
 
 name: Dev Deploy
 
-# Controls when the action will run. 
+# Controls when the action will run.
 on:
   # Triggers the workflow on push or pull request events but only for the master branch    branches: [ develop ]
   push:
-    branches: [ develop ]
+    branches: [develop]
 
   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
@@ -18,14 +18,14 @@ jobs:
     runs-on: ubuntu-latest
     environment: dev
 
-    outputs: 
+    outputs:
       workflows: ${{ steps.filter.outputs.workflows }}
       app: ${{ steps.filter.outputs.app }}
       etl: ${{ steps.filter.outputs.etl }}
 
     # Steps represent a sequence of tasks that will be executed as part of the job
-    steps:      
-      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 
+    steps:
+      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
       - uses: actions/checkout@v4
       - uses: dorny/paths-filter@v3
         id: filter
@@ -37,12 +37,12 @@ jobs:
               - 'app/**'
             etl:
               - 'etl/**'
-        
+
   app:
     # Check if this folder has any changes
     needs: changes
-    if: ${{ 
-      needs.changes.outputs.app == 'true' ||  
+    if: ${{
+      needs.changes.outputs.app == 'true' ||
       needs.changes.outputs.workflows == 'true' }}
 
     # The type of runner that the job will run on
@@ -78,19 +78,20 @@ jobs:
       MAX_QUERY_SIZE: 1000000
       SERVER_BASE_PATH: /expertquery
       SERVER_URL: https://owapps-dev.app.cloud.gov/expertquery
+      SKIP_DOCUMENTS_TEXT_QA: true
       STREAM_BATCH_SIZE: 2000
       STREAM_HIGH_WATER_MARK: 10000
 
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
       - uses: actions/checkout@v4
-      
+
       # Set up node and npm
       - uses: actions/setup-node@v4
         with:
           node-version: "20"
-      
+
       # Run front-end processes (install, lint, test, bundle)
       - name: Cache node modules
         uses: actions/cache@v4
@@ -148,6 +149,7 @@ jobs:
           cf set-env $APP_NAME "PUBLIC_URL" "$SERVER_URL" > /dev/null
           cf set-env $APP_NAME "SERVER_BASE_PATH" "$SERVER_BASE_PATH" > /dev/null
           cf set-env $APP_NAME "SERVER_URL" "$SERVER_URL" > /dev/null
+          cf set-env $APP_NAME "SKIP_DOCUMENTS_TEXT_QA" "$SKIP_DOCUMENTS_TEXT_QA" > /dev/null
           cf set-env $APP_NAME "STREAM_BATCH_SIZE" "$STREAM_BATCH_SIZE" > /dev/null
           cf set-env $APP_NAME "STREAM_HIGH_WATER_MARK" "$STREAM_HIGH_WATER_MARK" > /dev/null
           cf set-env $APP_NAME "TZ" "America/New_York" > /dev/null
@@ -177,9 +179,9 @@ jobs:
   etl:
     # Check if this folder has any changes
     needs: changes
-    if: ${{ 
-        needs.changes.outputs.etl == 'true' ||  
-        needs.changes.outputs.workflows == 'true' }}
+    if: ${{
+      needs.changes.outputs.etl == 'true' ||
+      needs.changes.outputs.workflows == 'true' }}
 
     # The type of runner that the job will run on
     runs-on: ubuntu-latest
@@ -210,7 +212,7 @@ jobs:
     steps:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
       - uses: actions/checkout@v4
-      
+
       # Set up node and npm
       - uses: actions/setup-node@v4
         with:

diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml
@@ -79,6 +79,7 @@ jobs:
       MAX_QUERY_SIZE: 1000000
       SERVER_BASE_PATH: /expertquery
       SERVER_URL: https://owapps-stage.app.cloud.gov/expertquery
+      SKIP_DOCUMENTS_TEXT_QA: true
       STREAM_BATCH_SIZE: 2000
       STREAM_HIGH_WATER_MARK: 10000
 
@@ -149,6 +150,7 @@ jobs:
           cf set-env $APP_NAME "PUBLIC_URL" "$SERVER_URL" > /dev/null
           cf set-env $APP_NAME "SERVER_BASE_PATH" "$SERVER_BASE_PATH" > /dev/null
           cf set-env $APP_NAME "SERVER_URL" "$SERVER_URL" > /dev/null
+          cf set-env $APP_NAME "SKIP_DOCUMENTS_TEXT_QA" "$SKIP_DOCUMENTS_TEXT_QA" > /dev/null
           cf set-env $APP_NAME "STREAM_BATCH_SIZE" "$STREAM_BATCH_SIZE" > /dev/null
           cf set-env $APP_NAME "STREAM_HIGH_WATER_MARK" "$STREAM_HIGH_WATER_MARK" > /dev/null
           cf set-env $APP_NAME "TZ" "America/New_York" > /dev/null

diff --git a/app/server/app/content/swagger/api-public.json b/app/server/app/content/swagger/api-public.json
@@ -4471,7 +4471,7 @@
       "documentQueryParam": {
         "name": "documentQuery",
         "in": "query",
-        "example": ["Tuscumbia River Canal"],
+        "example": "Tuscumbia River Canal",
         "schema": {
           "type": "string"
         }

diff --git a/app/server/app/routes/attains.js b/app/server/app/routes/attains.js
@@ -464,7 +464,10 @@ function parseDocumentSearchCriteria(req, query, profile, queryParams) {
       .orderBy('rankPercent', 'desc')
       .groupBy(selectColumns.map((col) => col.name));
   } else {
-    query.select(selectColumns.map(asAlias)).orderBy('objectid', 'asc');
+    query
+      .select(selectColumns.map(asAlias))
+      .orderBy('objectid', 'asc')
+      .groupBy(selectColumns.map((col) => col.name));
   }
 
   // build where clause of the query

diff --git a/etl/app/server/database.js b/etl/app/server/database.js
@@ -76,17 +76,30 @@ function extractProfileName(name) {
     .replace('.csv', '');
 }
 
-async function cacheProfileStats(pool, schemaName, profileStats, s3Stats) {
+async function cacheProfileStats(
+  pool,
+  s3Config,
+  schemaName,
+  profileStats,
+  s3Stats,
+) {
   const client = await getClient(pool);
   try {
     await client.query('BEGIN');
     for (const profile of profileStats.details) {
       const profileName = extractProfileName(profile.name);
 
+      // verify table exists in tableConfig
+      const tableConfig = Object.values(s3Config.tableConfig).find(
+        (table) => table.tableName === profileName,
+      );
+      if (!tableConfig) continue;
+
       // lookup the file size from s3Stats
       const s3Metadata = s3Stats.files.find(
         (f) => extractProfileName(f.name) === profileName,
       );
+      if (!s3Metadata) continue;
 
       await client.query(
         'INSERT INTO logging.mv_profile_stats(profile_name, schema_name, num_rows, last_refresh_end_time, last_refresh_elapsed, csv_size, gz_size, zip_size, creation_date)' +
@@ -762,12 +775,23 @@ export async function runLoad(pool, s3Config, s3Julian, logId) {
       }),
     );
 
-    const profileStats = await getProfileStats(pool, schemaName, s3Julian);
+    const profileStats = await getProfileStats(
+      pool,
+      s3Config,
+      schemaName,
+      s3Julian,
+    );
 
     // Verify the etl was successfull and the data matches what we expect.
     // We skip this when running locally, since the row counts will never match.
     if (!isLocal) {
-      await certifyEtlComplete(pool, profileStats, schemaId, schemaName);
+      await certifyEtlComplete(
+        pool,
+        s3Config,
+        profileStats,
+        schemaId,
+        schemaName,
+      );
     }
 
     await transferSchema(pool, schemaName, schemaId);
@@ -781,7 +805,7 @@ export async function runLoad(pool, s3Config, s3Julian, logId) {
   }
 }
 
-async function getProfileStats(pool, schemaName, s3Julian) {
+async function getProfileStats(pool, s3Config, schemaName, s3Julian) {
   // get profile stats from s3
   const profileStats = await readS3File({
     bucketInfo: {
@@ -804,23 +828,42 @@ async function getProfileStats(pool, schemaName, s3Julian) {
     path: `national-downloads/${s3Julian}/status.json`,
   });
 
-  await cacheProfileStats(pool, schemaName, profileStats, s3Stats);
+  await cacheProfileStats(pool, s3Config, schemaName, profileStats, s3Stats);
 
   return profileStats.details;
 }
 
 // Verify the data pulled in from the ETL matches the materialized views.
-async function certifyEtlComplete(pool, profileStats, logId, schemaName) {
+async function certifyEtlComplete(
+  pool,
+  s3Config,
+  profileStats,
+  logId,
+  schemaName,
+) {
   // loop through and make sure the tables exist and the counts match
   let issuesMessage = '';
   for (const profile of profileStats) {
     const profileName = extractProfileName(profile.name);
 
+    // verify table exists in tableConfig
+    const tableConfig = Object.values(s3Config.tableConfig).find(
+      (table) => table.tableName === profileName,
+    );
+    if (!tableConfig) continue;
+
     // check date
     if (profile.last_refresh_end_time <= profile.last_refresh_date) {
       issuesMessage += `${profileName} issue: last_refresh_end_time (${profile.last_refresh_end_time}) is not after last_refresh_date (${profile.last_refresh_date}).\n`;
     }
 
+    // TODO: Remove this continuation once we get counts to align.
+    if (
+      process.env.SKIP_DOCUMENTS_TEXT_QA === 'true' &&
+      tableConfig.id === 'documentsText'
+    )
+      continue;
+
     // query to get row count
     const queryRes = await pool.query(
       `SELECT COUNT(*) FROM "${schemaName}"."${profileName}"`,