one bucket (allenai#187)
geli-gel authored Sep 8, 2022
1 parent 4761afc commit b950c13
Showing 2 changed files with 7 additions and 18 deletions.
4 changes: 2 additions & 2 deletions scripts/ai2-internal/README.md
@@ -2,9 +2,9 @@
 #### Fetching PDFs (AI2 Internal)

 The `fetch_pdfs.py` script fetches S2 pdfs for use by PAWLS using paper shas.
-This requires access to private S2 pdf buckets, so is for internal use only. However,
+This requires access to the S2 pdf bucket, so is for internal use only. However,
 you can use PAWLS without using this script if you already have pdfs locally! This is simply
 a utility for S2 Researchers.

-The `fetch_pdfs.py` script requires a AWS key with read access to the S2 Pdf buckets. Your `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` which you use for day-to-day AI2 work will
+The `fetch_pdfs.py` script requires an AWS key with read access to the S2 Pdf bucket. Your `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` which you use for day-to-day AI2 work will
 be suitable - just make sure they are set as environment variables when running the PAWLS CLI.
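As an aside (not part of this commit): the credentials the README mentions are the standard variables boto3 reads from the environment, so a minimal sketch of the setup it describes could look like the following. The variable names and bucket name come from the diff; the check and error message are illustrative only.

```python
# Illustrative sketch: confirm the environment variables the README asks for,
# then open the single S2 pdf bucket introduced by this commit.
import os

import boto3

REQUIRED_VARS = ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
missing = [v for v in REQUIRED_VARS if v not in os.environ]
if missing:
    raise EnvironmentError(f"Set {', '.join(missing)} before running fetch_pdfs.py")

# boto3 picks the credentials up from the environment automatically.
pdf_bucket = boto3.resource("s3").Bucket("ai2-s2-pdfs")
```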
21 changes: 5 additions & 16 deletions scripts/ai2-internal/fetch_pdfs.py
@@ -86,7 +86,7 @@ def fetch(path: click.Path, shas: Tuple[str], sha_file: click.Path = None):


 # settings for S3 buckets
-S3_BUCKET_PDFS = {"default": "ai2-s2-pdfs", "private": "ai2-s2-pdfs-private"}
+PDF_BUCKET_NAME = "ai2-s2-pdfs"


 def _per_dir_pdf_download(target_dir: str, sha: str):
@@ -124,34 +124,22 @@ def bulk_fetch_pdfs_for_s2_ids(

     os.makedirs(target_dir, exist_ok=True)
     s3 = boto3.resource("s3")
-    default_bucket = s3.Bucket(S3_BUCKET_PDFS["default"])
-    private_bucket = s3.Bucket(S3_BUCKET_PDFS["private"])
+    pdf_bucket = s3.Bucket(PDF_BUCKET_NAME)

     not_found = set()
     error = set()
     success = set()
     for s2_id in s2_ids:
         try:
-            default_bucket.download_file(
+            pdf_bucket.download_file(
                 os.path.join(s2_id[:4], f"{s2_id[4:]}.pdf"),
                 pdf_path_func(target_dir, s2_id),
             )
             success.add(s2_id)

         except botocore.exceptions.ClientError as e:
             if e.response["Error"]["Code"] == "404":
-                try:
-                    private_bucket.download_file(
-                        os.path.join(s2_id[:4], f"{s2_id[4:]}.pdf"),
-                        pdf_path_func(target_dir, s2_id),
-                    )
-                    success.add(s2_id)
-
-                except botocore.exceptions.ClientError as e:
-                    if e.response["Error"]["Code"] == "404":
-                        not_found.add(s2_id)
-                    else:
-                        error.add(s2_id)
+                not_found.add(s2_id)
             else:
                 error.add(s2_id)

@@ -179,6 +167,7 @@ def get_paper_title(paper_sha: str) -> Optional[str]:
     else:
         return None

+
 if __name__ == "__main__":

     fetch()
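For reference, the key layout used in the loop above (first four characters of the sha as a prefix, the remainder plus `.pdf` as the object name) can be exercised on its own. A minimal standalone sketch, assuming read access to `ai2-s2-pdfs`; `download_one_pdf` is a hypothetical helper, not a function from the script:

```python
# Minimal sketch of the single-bucket download flow shown in the diff above.
import os

import boto3

PDF_BUCKET_NAME = "ai2-s2-pdfs"


def download_one_pdf(s2_id: str, target_dir: str) -> str:
    """Fetch one S2 pdf; the object key is '<first 4 chars of sha>/<rest>.pdf'."""
    os.makedirs(target_dir, exist_ok=True)
    bucket = boto3.resource("s3").Bucket(PDF_BUCKET_NAME)
    key = os.path.join(s2_id[:4], f"{s2_id[4:]}.pdf")
    local_path = os.path.join(target_dir, f"{s2_id}.pdf")
    bucket.download_file(key, local_path)
    return local_path
```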
