Skip to content

Commit

Permalink
Add CLI command for PDF parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
du-phan committed Jan 22, 2025
1 parent 44f0134 commit be2e189
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 0 deletions.
Empty file.
32 changes: 32 additions & 0 deletions sequestrae_engine/cli/commands.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import os
from pathlib import Path

from sequestrae_engine.document_parsing.parser import PDFToMarkdownParser


def parse_pdfs_command(api_key, project_dir, limit=5):
    """Parse "report" PDFs found one level below ``project_dir``.

    Every non-hidden sub-directory of ``project_dir`` is scanned for ``*.pdf``
    files whose stem contains "report" (case-insensitive). Each match is handed
    to ``PDFToMarkdownParser.parse_pdf`` (presumably converting it to Markdown;
    the parser is defined outside this module) until ``limit`` files have been
    processed.

    Args:
        api_key: Key passed to ``PDFToMarkdownParser``; must be non-empty.
        project_dir: Directory whose immediate sub-directories hold the PDFs.
        limit: Maximum number of PDFs to process. ``None`` falls back to 5.

    Returns:
        0 on success, 1 when the API key is missing or ``project_dir`` is not
        an existing directory.
    """
    if limit is None:
        limit = 5
    print(f"Max number of files to process: {limit}")

    if not api_key:
        print("Error: LLAMA_API_KEY is required")
        return 1

    # Validate the directory before building the parser so a bad path fails
    # fast. is_dir() also rejects a path that exists but is a regular file,
    # which would otherwise make iterdir() raise NotADirectoryError.
    project_path = Path(project_dir)
    if not project_path.is_dir():
        print(f"Error: Directory not found at {project_path}")
        return 1

    parser = PDFToMarkdownParser(api_key=api_key)

    pdf_count = 0
    # Sorted traversal makes the subset selected under a limit reproducible;
    # iterdir()/glob() yield entries in arbitrary order.
    for folder_path in sorted(project_path.iterdir()):
        if pdf_count >= limit:
            break
        if folder_path.name.startswith(".") or not folder_path.is_dir():
            continue
        for pdf_path in sorted(folder_path.glob("*.pdf")):
            # Re-check per file: a single folder may hold more matching PDFs
            # than the remaining budget.
            if pdf_count >= limit:
                break
            if pdf_path.is_file() and "report" in pdf_path.stem.lower():
                parser.parse_pdf(pdf_path)
                pdf_count += 1

    print(f"Successfully processed {pdf_count} PDF files")
    return 0
35 changes: 35 additions & 0 deletions sequestrae_engine/cli/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import argparse
import os
import sys

from . import commands


def main(argv=None):
    """Run the Sequestrae Engine command-line interface.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, as before.

    Returns:
        The exit status returned by the selected sub-command, or 1 when no
        sub-command was given (the help text is printed in that case).
    """
    parser = argparse.ArgumentParser(description="Sequestrae Engine CLI")
    subparsers = parser.add_subparsers(dest="command")

    # Parse PDFs command
    parse_pdfs_parser = subparsers.add_parser(
        "parse-pdf", help="Parse PDF audit report files to Markdown"
    )
    # The environment is read when the parser is built, so LLAMA_API_KEY only
    # acts as a fallback when --api-key is not given on the command line.
    parse_pdfs_parser.add_argument(
        "--api-key", help="Llama API key", default=os.environ.get("LLAMA_API_KEY")
    )
    parse_pdfs_parser.add_argument("--project-dir", help="Project data directory", required=True)
    parse_pdfs_parser.add_argument("--limit", help="Maximum number of PDFs to process", type=int)
    parse_pdfs_parser.set_defaults(
        func=lambda args: commands.parse_pdfs_command(args.api_key, args.project_dir, args.limit)
    )

    args = parser.parse_args(argv)

    # Sub-commands are optional, so "func" is absent when none was selected.
    handler = getattr(args, "func", None)
    if handler is None:
        parser.print_help()
        return 1
    return handler(args)


if __name__ == "__main__":
    # Propagate the command's status to the shell; without sys.exit the
    # process would always exit 0, even when main() returns 1.
    sys.exit(main())

0 comments on commit be2e189

Please sign in to comment.