allenai · egork520 · Mar 26, 2022 · Mar 28, 2022 · Mar 29, 2022 · Mar 29, 2022
diff --git a/.gitignore b/.gitignore
@@ -18,7 +18,7 @@ ui/*.log
 # Ignore any files in ./skiff_files
 skiff_files/*
 
-### Python Build Related ### 
+### Python Build Related ###
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -157,4 +157,11 @@ dmypy.json
 .pytype/
 
 # Cython debug symbols
-cython_debug/
+cython_debug/
+
+# macOS, Node, editor, and local scratch artifacts
+.DS_Store
+node_modules/
+.vscode
+/tmp
+ui/package-lock.json
diff --git a/.skiff/cloudbuild-deploy.yaml b/.skiff/cloudbuild-deploy.yaml
@@ -137,4 +137,4 @@ artifacts:
     location: 'gs://skiff-archive/$REPO_NAME/$_ENV/$BUILD_ID/$COMMIT_SHA'
     paths: ['.skiff/webapp.yaml']
 
-timeout: 900s
+timeout: 3600s
diff --git a/.skiff/webapp.jsonnet b/.skiff/webapp.jsonnet
@@ -24,16 +24,6 @@ function(
             config.appName + '.' + env + topLevelDomain
     ];
 
-    // In production we run two versions of your application, as to ensure that
-    // if one instance goes down or is busy, end users can still use the application.
-    // In all other environments we run a single instance to save money.
-    local replicas = (
-        if env == 'prod' then
-            2
-        else
-            1
-    );
-
     // Each app gets it's own namespace.
     local namespaceName = config.appName;
 
@@ -122,7 +112,9 @@ function(
                 'nginx.ingress.kubernetes.io/ssl-redirect': 'true',
                 'nginx.ingress.kubernetes.io/auth-url': 'https://google.login.apps.allenai.org/oauth2/auth',
                 'nginx.ingress.kubernetes.io/auth-signin': 'https://google.login.apps.allenai.org/oauth2/start?rd=https://$host$request_uri',
-                'nginx.ingress.kubernetes.io/auth-response-headers': 'X-Auth-Request-User, X-Auth-Request-Email'
+                'nginx.ingress.kubernetes.io/auth-response-headers': 'X-Auth-Request-User, X-Auth-Request-Email',
+                'nginx.ingress.kubernetes.io/proxy-read-timeout': '300',
+                'nginx.ingress.kubernetes.io/proxy-body-size': '50m'
             }
         },
         spec: {
@@ -163,7 +155,7 @@ function(
         },
         spec: {
             revisionHistoryLimit: 3,
-            replicas: replicas,
+            replicas: config.replicas,
             selector: {
                 matchLabels: selectorLabels
             },
@@ -215,7 +207,27 @@ function(
                         {
                             name: fullyQualifiedName + '-api',
                             image: apiImage,
-                            env: [ { name: "IN_PRODUCTION", value: "prod" }],
+                            env: [
+                                { name: "IN_PRODUCTION", value: "prod" },
+                                {
+                                    name: "AWS_ACCESS_KEY_ID",
+                                    valueFrom: {
+                                        secretKeyRef: {
+                                            name: "aws-pdf-iam",
+                                            key: "AWS_ACCESS_KEY_ID"
+                                        }
+                                    }
+                                },
+                                {
+                                    name: "AWS_SECRET_ACCESS_KEY",
+                                    valueFrom: {
+                                        secretKeyRef: {
+                                            name: "aws-pdf-iam",
+                                            key: "AWS_SECRET_ACCESS_KEY"
+                                        }
+                                    }
+                                },
+                            ],
                             volumeMounts: [
                                 {
                                     mountPath: '/skiff_files/apps/pawls',
@@ -265,14 +277,6 @@ function(
                                 periodSeconds: 10,
                                 failureThreshold: 3
                             },
-                            livenessProbe: {
-                                httpGet: apiHealthCheck + {
-                                    path: '/?check=liveness_probe'
-                                },
-                                periodSeconds: 10,
-                                failureThreshold: 9,
-                                initialDelaySeconds: 30
-                            },
                             # This tells Kubernetes what CPU and memory resources your API needs.
                             # We set these values low by default, as most applications receive
                             # bursts of activity and accordingly don't need dedicated resources
@@ -310,16 +314,10 @@ function(
                                     path: '/?check=rdy'
                                 }
                             },
-                            livenessProbe: {
-                                failureThreshold: 6,
-                                httpGet: proxyHealthCheck + {
-                                    path: '/?check=live'
-                                }
-                            },
                             resources: {
                                 requests: {
-                                   cpu: '50m',
-                                   memory: '100Mi'
+                                   cpu: '500m',
+                                   memory: '500Mi'
                                 }
                             }
                         }

diff --git a/README.md b/README.md
@@ -53,7 +53,11 @@ For instance, you can run the following commands to download, preprocess, and as
   pawls assign skiff_files/apps/pawls/papers [email protected] --all --name-file skiff_files/apps/pawls/papers/name_mapping.json
 ```
 
-and then open up the UI locally by running `docker-compose up`.
+#### Getting annotation files to s3
+PDFs and the assignment files from the status folder need to be copied to the S3 bucket given by `output_directory` in `api/config/configuration.json`.
+Annotations will be uploaded to that same `output_directory`.
+
+And then open up the UI locally by running `docker-compose up`.
 
 ### Authentication and Authorization
 
@@ -203,3 +207,42 @@ If you find PAWLS helpful for your research, please consider cite PAWLS.
 ---
 
 PAWLS is an open-source project developed by [the Allen Institute for Artificial Intelligence (AI2)](http://www.allenai.org). AI2 is a non-profit institute with the mission to contribute to humanity through high-impact AI research and engineering.
+
+## Replica Management
+
+Because the application is used in short bursts for annotation projects, we manually turn
+the application on and off. We do this by managing the number of replicas, toggling it from
+`0` to `1` and vice versa.
+
+To adjust the number of replicas, edit the `skiff.json` and change the replica
+count. For instance, you can turn the application "off" like so:
+
+```diff
+{
+    "appName": "pawls",
+    "contact": "lucas",
+    "team": "s2research",
+-    "replicas": 1
++    "replicas": 0
+}
+```
+
+...and turn it back "on" by reversing that change:
+
+```diff
+{
+    "appName": "pawls",
+    "contact": "lucas",
+    "team": "s2research",
+-    "replicas": 0
++    "replicas": 1
+}
+```
+
+The change will be applied after committing and pushing your change. It usually
+takes around 5 minutes or so for things to take effect.
+
+You can confirm the change by visiting [Marina](https://marina.apps.allenai.org/a/pawls)
+and inspecting the "Replicas" list for the `skimming-annotations` environment.
+The number of replicas displayed there should match the value in `skiff.json`.
+
diff --git a/api/Dockerfile b/api/Dockerfile
@@ -1,4 +1,5 @@
-FROM python:3.7.2
+FROM python:3.8.12
+
 
 # Setup a spot for the api code
 WORKDIR /usr/local/src/skiff/app/api
@@ -11,6 +12,7 @@ COPY requirements.txt .
 
 RUN pip install -r requirements.txt
 
+########## COPYING SOURCE CODE FROM HERE ON ##########
 
 # Copy over the source code
 COPY app app/
@@ -19,4 +21,4 @@ COPY main.py main.py
 
 # Kick things off
 ENTRYPOINT [ "uvicorn" ]
-CMD ["main:app", "--host", "0.0.0.0"]
+CMD ["main:app", "--host", "0.0.0.0"]
diff --git a/api/app/annotations.py b/api/app/annotations.py
@@ -1,5 +1,5 @@
 from typing import Optional, List
-from pydantic import BaseModel
+from pydantic import BaseModel, Field, validator
 
 
 class Bounds(BaseModel):
@@ -36,3 +36,22 @@ class RelationGroup(BaseModel):
 class PdfAnnotation(BaseModel):
     annotations: List[Annotation]
     relations: List[RelationGroup]
+
+
+class PageSpec(BaseModel):  # Width, height, and index describing a single page.
+    width: int  # NOTE(review): api/app/pdfplumber.py emits float(cur_page.width) — confirm int is intended
+    height: int  # NOTE(review): api/app/pdfplumber.py emits float(cur_page.height) — confirm int is intended
+    index: int  # the pdfplumber extractor in this change fills this with a 0-based page counter
+
+
+class PageToken(BaseModel):  # Text plus x/y/width/height for a single token on a page.
+    text: str
+    width: float
+    height: float
+    x: float  # NOTE(review): the extractor maps pdfplumber's "x0" here — presumably the left edge; confirm
+    y: float  # NOTE(review): the extractor maps pdfplumber's "top" here — presumably the top edge; confirm
+
+
+class Page(BaseModel):  # A page spec paired with the tokens found on that page.
+    page: PageSpec
+    tokens: List[PageToken] = Field(default_factory=list)  # fresh empty list per instance
diff --git a/api/app/pdfplumber.py b/api/app/pdfplumber.py
@@ -0,0 +1,126 @@
+from typing import List
+
+import pandas as pd
+import pdfplumber
+
+from .annotations import PageToken, Page
+
+import json
+import logging
+from pathlib import Path
+from typing import Union
+
+
+logger = logging.getLogger("uvicorn")
+
+
+class PDFPlumberTokenExtractor:
+    """Extracts word tokens and their bounding boxes from a PDF via pdfplumber."""
+
+    @staticmethod
+    def convert_to_pagetoken(row: pd.Series) -> dict:
+        """Convert one DataFrame row of word data into a PageToken-shaped dict.
+
+        The result is a plain dict rather than a `PageToken` instance.
+        """
+        return dict(
+            text=row["text"],
+            x=row["x0"],
+            width=row["width"],
+            y=row["top"],
+            height=row["height"],
+        )
+
+    def extract(self, pdf_path: str) -> List[Page]:
+        """Extract token text and positions for every page of a PDF file.
+
+        Args:
+            pdf_path (str): the path to the pdf file.
+
+        Returns:
+            List[Page]: one plain dict per page, shaped like `Page`
+                (`page` holds width/height/index, `tokens` the word tokens).
+        """
+        pages = []
+        # The context manager closes the PDF's file handle, even on errors.
+        with pdfplumber.open(pdf_path) as plumber_pdf_object:
+            for page_id, cur_page in enumerate(plumber_pdf_object.pages):
+                page = dict(
+                    page=dict(
+                        width=float(cur_page.width),
+                        height=float(cur_page.height),
+                        index=page_id
+                    ),
+                    tokens=self.obtain_word_tokens(cur_page)
+                )
+                pages.append(page)
+
+        return pages
+
+    def obtain_word_tokens(self, cur_page: pdfplumber.page.Page) -> List[PageToken]:
+        """Obtain all words from the current page.
+
+        Args:
+            cur_page (pdfplumber.page.Page):
+                the pdfplumber.page.Page object with PDF token information
+
+        Returns:
+            List[PageToken]:
+                A list of PageToken-shaped dicts; empty if the page has no words.
+        """
+        words = cur_page.extract_words(
+            x_tolerance=1.5,
+            y_tolerance=3,
+            keep_blank_chars=False,
+            use_text_flow=True,
+            horizontal_ltr=True,
+            vertical_ttb=True,
+            extra_attrs=["fontname", "size"],
+        )
+        if len(words) == 0:
+            return []
+
+        df = pd.DataFrame(words)
+
+        # Avoid boxes outside the page (bounds use the int-truncated page size)
+        df[["x0", "x1"]] = df[["x0", "x1"]].\
+            clip(lower=0, upper=int(cur_page.width)).\
+                astype("float")
+
+        df[["top", "bottom"]] = df[["top", "bottom"]].\
+            clip(lower=0, upper=int(cur_page.height)).\
+                astype("float")
+
+        df["height"] = df["bottom"] - df["top"]
+        df["width"] = df["x1"] - df["x0"]
+
+        return df.apply(self.convert_to_pagetoken, axis=1).tolist()
+
+
+
+def process_pdfplumber(file_path: Union[str, Path]) -> Path:
+    """
+    Extract token information from a single PDF with pdfplumber and cache it
+    as `pdf_structure.json` in the PDF's directory (skipped if it exists).
+
+    Returns the path to `pdf_structure.json`; raises ValueError if
+    `file_path` does not exist.
+    """
+    file_path = Path(file_path)
+
+    if not file_path.exists():
+        raise ValueError(f'Cannot find {file_path}')
+
+    structure_path = file_path.parent / "pdf_structure.json"
+
+    if not structure_path.exists():
+        # Log through the module-level "uvicorn" logger, not the root logger.
+        logger.info(f"Processing {file_path} using pdfplumber...")
+
+        data = PDFPlumberTokenExtractor().extract(file_path)
+        with open(structure_path, mode="w+", encoding='utf-8') as f:
+            json.dump(data, f)
+    else:
+        logger.warning(f"Parsed {structure_path} exists, skipping...")
+
+    return structure_path