one bucket (allenai#187)
geli-gel authored Sep 8, 2022
1 parent 4761afc commit b950c13
Showing 2 changed files with 7 additions and 18 deletions.
4 changes: 2 additions & 2 deletions scripts/ai2-internal/README.md
@@ -2,9 +2,9 @@
 #### Fetching PDFs (AI2 Internal)

 The `fetch_pdfs.py` script fetches S2 pdfs for use by PAWLS using paper shas.
-This requires access to private S2 pdf buckets, so is for internal use only. However,
+This requires access to the S2 pdf bucket, so is for internal use only. However,
 you can use PAWLS without using this script if you already have pdfs locally! This is simply
 a utility for S2 Researchers.

-The `fetch_pdfs.py` script requires a AWS key with read access to the S2 Pdf buckets. Your `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` which you use for day-to-day AI2 work will
+The `fetch_pdfs.py` script requires an AWS key with read access to the S2 Pdf bucket. Your `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` which you use for day-to-day AI2 work will
 be suitable - just make sure they are set as environment variables when running the PAWLS CLI.
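As an aside (not part of this commit): the credentials the README mentions are the standard variables boto3 reads from the environment, so a minimal sketch of the setup it describes could look like the following. The variable names and bucket name come from the diff; the check and error message are illustrative only.

```python
# Illustrative sketch: confirm the environment variables the README asks for,
# then open the single S2 pdf bucket introduced by this commit.
import os

import boto3

REQUIRED_VARS = ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
missing = [v for v in REQUIRED_VARS if v not in os.environ]
if missing:
    raise EnvironmentError(f"Set {', '.join(missing)} before running fetch_pdfs.py")

# boto3 picks the credentials up from the environment automatically.
pdf_bucket = boto3.resource("s3").Bucket("ai2-s2-pdfs")
```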
21 changes: 5 additions & 16 deletions scripts/ai2-internal/fetch_pdfs.py
@@ -86,7 +86,7 @@ def fetch(path: click.Path, shas: Tuple[str], sha_file: click.Path = None):


 # settings for S3 buckets
-S3_BUCKET_PDFS = {"default": "ai2-s2-pdfs", "private": "ai2-s2-pdfs-private"}
+PDF_BUCKET_NAME = "ai2-s2-pdfs"


 def _per_dir_pdf_download(target_dir: str, sha: str):
@@ -124,34 +124,22 @@ def bulk_fetch_pdfs_for_s2_ids(

     os.makedirs(target_dir, exist_ok=True)
     s3 = boto3.resource("s3")
-    default_bucket = s3.Bucket(S3_BUCKET_PDFS["default"])
-    private_bucket = s3.Bucket(S3_BUCKET_PDFS["private"])
+    pdf_bucket = s3.Bucket(PDF_BUCKET_NAME)

     not_found = set()
     error = set()
     success = set()
     for s2_id in s2_ids:
         try:
-            default_bucket.download_file(
+            pdf_bucket.download_file(
                 os.path.join(s2_id[:4], f"{s2_id[4:]}.pdf"),
                 pdf_path_func(target_dir, s2_id),
             )
             success.add(s2_id)

         except botocore.exceptions.ClientError as e:
             if e.response["Error"]["Code"] == "404":
-                try:
-                    private_bucket.download_file(
-                        os.path.join(s2_id[:4], f"{s2_id[4:]}.pdf"),
-                        pdf_path_func(target_dir, s2_id),
-                    )
-                    success.add(s2_id)
-
-                except botocore.exceptions.ClientError as e:
-                    if e.response["Error"]["Code"] == "404":
-                        not_found.add(s2_id)
-                    else:
-                        error.add(s2_id)
+                not_found.add(s2_id)
             else:
                 error.add(s2_id)

@@ -179,6 +167,7 @@ def get_paper_title(paper_sha: str) -> Optional[str]:
     else:
         return None

+
 if __name__ == "__main__":

     fetch()
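For reference, the key layout used in the loop above (first four characters of the sha as a prefix, the remainder plus `.pdf` as the object name) can be exercised on its own. A minimal standalone sketch, assuming read access to `ai2-s2-pdfs`; `download_one_pdf` is a hypothetical helper, not a function from the script:

```python
# Minimal sketch of the single-bucket download flow shown in the diff above.
import os

import boto3

PDF_BUCKET_NAME = "ai2-s2-pdfs"


def download_one_pdf(s2_id: str, target_dir: str) -> str:
    """Fetch one S2 pdf; the object key is '<first 4 chars of sha>/<rest>.pdf'."""
    os.makedirs(target_dir, exist_ok=True)
    bucket = boto3.resource("s3").Bucket(PDF_BUCKET_NAME)
    key = os.path.join(s2_id[:4], f"{s2_id[4:]}.pdf")
    local_path = os.path.join(target_dir, f"{s2_id}.pdf")
    bucket.download_file(key, local_path)
    return local_path
```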
