From b950c1346bd3c6a52ee96509d273ef9e37315695 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Wed, 7 Sep 2022 17:40:50 -0700 Subject: [PATCH] one bucket (#187) --- scripts/ai2-internal/README.md | 4 ++-- scripts/ai2-internal/fetch_pdfs.py | 21 +++++---------------- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/scripts/ai2-internal/README.md b/scripts/ai2-internal/README.md index 5c5e4693..0ea6ccf5 100644 --- a/scripts/ai2-internal/README.md +++ b/scripts/ai2-internal/README.md @@ -2,9 +2,9 @@ #### Fetching PDFs (AI2 Internal) The `fetch_pdfs.py` script fetches S2 pdfs for use by PAWLS using paper shas. -This requires access to private S2 pdf buckets, so is for internal use only. However, +This requires access to the S2 pdf bucket, so is for internal use only. However, you can use PAWLS without using this script if you already have pdfs locally! This is simply a utility for S2 Researchers. -The `fetch_pdfs.py` script requires a AWS key with read access to the S2 Pdf buckets. Your `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` which you use for day-to-day AI2 work will +The `fetch_pdfs.py` script requires an AWS key with read access to the S2 Pdf bucket. Your `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` which you use for day-to-day AI2 work will be suitable - just make sure they are set as environment variables when running the PAWLS CLI. 
diff --git a/scripts/ai2-internal/fetch_pdfs.py b/scripts/ai2-internal/fetch_pdfs.py index 5ecedbbb..eda9a1d4 100644 --- a/scripts/ai2-internal/fetch_pdfs.py +++ b/scripts/ai2-internal/fetch_pdfs.py @@ -86,7 +86,7 @@ def fetch(path: click.Path, shas: Tuple[str], sha_file: click.Path = None): # settings for S3 buckets -S3_BUCKET_PDFS = {"default": "ai2-s2-pdfs", "private": "ai2-s2-pdfs-private"} +PDF_BUCKET_NAME = "ai2-s2-pdfs" def _per_dir_pdf_download(target_dir: str, sha: str): @@ -124,15 +124,14 @@ def bulk_fetch_pdfs_for_s2_ids( os.makedirs(target_dir, exist_ok=True) s3 = boto3.resource("s3") - default_bucket = s3.Bucket(S3_BUCKET_PDFS["default"]) - private_bucket = s3.Bucket(S3_BUCKET_PDFS["private"]) + pdf_bucket = s3.Bucket(PDF_BUCKET_NAME) not_found = set() error = set() success = set() for s2_id in s2_ids: try: - default_bucket.download_file( + pdf_bucket.download_file( os.path.join(s2_id[:4], f"{s2_id[4:]}.pdf"), pdf_path_func(target_dir, s2_id), ) @@ -140,18 +139,7 @@ def bulk_fetch_pdfs_for_s2_ids( except botocore.exceptions.ClientError as e: if e.response["Error"]["Code"] == "404": - try: - private_bucket.download_file( - os.path.join(s2_id[:4], f"{s2_id[4:]}.pdf"), - pdf_path_func(target_dir, s2_id), - ) - success.add(s2_id) - - except botocore.exceptions.ClientError as e: - if e.response["Error"]["Code"] == "404": - not_found.add(s2_id) - else: - error.add(s2_id) + not_found.add(s2_id) else: error.add(s2_id) @@ -179,6 +167,7 @@ def get_paper_title(paper_sha: str) -> Optional[str]: else: return None + if __name__ == "__main__": fetch()