Document parsing module #3

Draft · wants to merge 24 commits into base: main

Commits (24)
44f0134  add module to parse pdf to markdown  (du-phan, Jan 22, 2025)
be2e189  add cli command for the pdf parsing  (du-phan, Jan 22, 2025)
0465148  update dependencies  (du-phan, Jan 22, 2025)
e6f3f92  add parsed documents  (du-phan, Jan 22, 2025)
b18a956  add extractor using mistral  (du-phan, Jan 23, 2025)
3a9f98c  update github action  (du-phan, Jan 23, 2025)
7453bd4  add cli for extractor  (du-phan, Jan 23, 2025)
65e07d8  add tool to populate supabase  (du-phan, Jan 24, 2025)
0297b05  add functions to evaluate feedstock sustainability  (du-phan, Jan 25, 2025)
857aba1  cleaner logging  (du-phan, Jan 25, 2025)
f8efebf  check if file exists first before doing anything  (du-phan, Jan 25, 2025)
4e8a474  add cli command to populate supabase with feedstock evaluation  (du-phan, Jan 26, 2025)
1c320c9  updated data  (du-phan, Jan 26, 2025)
7702e58  add function to parse the whole folder at once  (du-phan, Feb 3, 2025)
a2470e4  add sequestrae due diligence criteria  (du-phan, Feb 3, 2025)
69a9ba5  add cli command for due diligence analyzer  (du-phan, Feb 4, 2025)
b3d8884  move from llama to gemini  (du-phan, Feb 7, 2025)
9c96df4  Removed project_data directory from repository tracking  (du-phan, Feb 7, 2025)
cdbd2d1  add hallucination detection prompt  (du-phan, Feb 10, 2025)
c96ba5a  add function to validate and fix the json array  (du-phan, Feb 10, 2025)
9014f9b  fix an error in the fix_hallucination_recursive function that leads t…  (du-phan, Feb 10, 2025)
a97343b  update .gitignore  (du-phan, Feb 10, 2025)
77795c1  fix minor error with the value to return in fix_hallucination_recursive  (du-phan, Feb 10, 2025)
f18c6cc  add retry on error decorator  (du-phan, Feb 11, 2025)
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -52,7 +52,7 @@ jobs:

      # Step 7: Upload Coverage Report as an Artifact (optional)
      - name: Upload Coverage Report
-       uses: actions/upload-artifact@v3
+       uses: actions/upload-artifact@v4
        with:
          name: coverage-report
          path: htmlcov/
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
+project_data/
.python-version
.vscode/
.DS_Store
2,628 changes: 2,291 additions & 337 deletions poetry.lock

Large diffs are not rendered by default.

18 changes: 16 additions & 2 deletions pyproject.toml
@@ -1,21 +1,35 @@
[tool.poetry.scripts]
sequestrae = "sequestrae_engine.cli.main:main"

[tool.poetry]
name = "sequestrae-engine"
version = "0.1.0"
-description = "A Biochar Carbon Removal Assessment Engine for evaluating carbon sequestration potential."
+description = "A Biochar Carbon Removal Assessment Engine for evaluating project risk."
authors = ["Du Phan <[email protected]>"]
license = "MIT"
readme = "README.md"
homepage = "https://github.com/du-phan/sequestrae-engine"
repository = "https://github.com/du-phan/sequestrae-engine"
keywords = ["biochar", "carbon removal", "assessment", "engine", "sequestration"]
packages = [
    { include = "sequestrae_engine" }
]

[tool.poetry.dependencies]
python = ">=3.10,<4.0"
numpy = ">=1.24,<2.0"
pandas = "^2.0"
-pydantic = "^1.10.2"
+pydantic = "^2.5.2"
jsonschema = "^4.16.0"
jupyterlab = "^4.0.5"
llama-index-core = "^0.10.0"
llama-parse = "^0.3.3"
llama-index-readers-file = "^0.1.6"
python-dotenv = "^1.0.0"
openai = "^1.0.0"
mistralai = "^1.3"
google-genai = "^1.0.0"
supabase = "^2.10.0"

[tool.poetry.dev-dependencies]
black = "24.10.0"
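
Reviewer note: the pydantic bump from ^1.10.2 to ^2.5.2 crosses a breaking major version, so any pydantic models in the engine need the v2 API. A minimal sketch of the renamed methods (the Feedstock model is hypothetical, for illustration only):

from pydantic import BaseModel


class Feedstock(BaseModel):  # hypothetical model, not part of this PR
    name: str
    moisture_pct: float


# v2 renames: parse_obj() -> model_validate(), .dict() -> .model_dump()
fs = Feedstock.model_validate({"name": "rice husk", "moisture_pct": 12.5})
print(fs.model_dump())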
216 changes: 216 additions & 0 deletions sequestrae_engine/cli/commands.py
@@ -0,0 +1,216 @@
import logging
import os
import time
from functools import wraps
from pathlib import Path
from time import sleep

from sequestrae_engine.db.client import SupabaseClient
from sequestrae_engine.db.scripts.populate_audit_reports import populate_feedstock_evaluation_table
from sequestrae_engine.document_parsing.extractors import AuditReportExtractor
from sequestrae_engine.document_parsing.parser import PDFToMarkdownParser

# Configure logger
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def retry_on_error(max_retries=3, delay=5):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            retries = 0
            while retries < max_retries:
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    retries += 1
                    if retries == max_retries:
                        raise e
                    logger.warning(
                        f"Error occurred: {str(e)}. Retrying in {delay} seconds... (Attempt {retries}/{max_retries})"
                    )
                    sleep(delay)
            return None

        return wrapper

    return decorator


def parse_pdfs_command(api_key, project_dir, limit=5):
    if limit is None:
        limit = 5
    logger.info(f"Max number of files to process: {limit}")

    if not api_key:
        logger.error("LLAMA_API_KEY is required")
        return 1

    parser = PDFToMarkdownParser(api_key=api_key)
    project_path = Path(project_dir)

    if not project_path.exists():
        logger.error(f"Directory not found at {project_path}")
        return 1

    pdf_count = 0
    for folder_path in project_path.iterdir():
        if not folder_path.name.startswith(".") and folder_path.is_dir() and pdf_count < limit:
            for pdf_path in folder_path.glob("*.pdf"):
                if pdf_path.is_file() and "report" in pdf_path.stem.lower():
                    pdf_count += 1
                    parser.parse_pdf(pdf_path)

    logger.info(f"Successfully processed {pdf_count} PDF files")
    return 0


def extract_audit_information_command(api_key, project_dir, limit=100):
    if limit is None:
        limit = 100
    logger.info(f"Max number of files to process: {limit}")

    if not api_key:
        logger.error("MISTRAL_API_KEY is required")
        return 1

    extractor = AuditReportExtractor(mistral_api_key=api_key)
    project_path = Path(project_dir)

    if not project_path.exists():
        logger.error(f"Directory not found at {project_path}")
        return 1

    markdown_count = 0
    for folder_path in project_path.iterdir():
        if not folder_path.name.startswith(".") and folder_path.is_dir() and markdown_count < limit:
            for md_path in folder_path.glob("*.md"):
                if md_path.is_file() and "report" in md_path.stem.lower():
                    markdown_count += 1
                    try:
                        retry_on_error()(extractor.parse_audit_report)(audit_report_path=md_path)
                    except Exception as e:
                        logger.error(f"Error processing {md_path}: {str(e)}")
                    time.sleep(1)  # Sleep for 1 second to avoid rate limiting

    logger.info(f"Successfully processed {markdown_count} markdown files")
    return 0


def evaluate_feedstock_sustainability_command(api_key, project_dir, limit=100):
    if limit is None:
        limit = 100
    logger.info(f"Max number of files to process: {limit}")

    if not api_key:
        logger.error("MISTRAL_API_KEY is required")
        return 1

    extractor = AuditReportExtractor(mistral_api_key=api_key)
    project_path = Path(project_dir)

    if not project_path.exists():
        logger.error(f"Directory not found at {project_path}")
        return 1

    markdown_count = 0
    for folder_path in project_path.iterdir():
        if not folder_path.name.startswith(".") and folder_path.is_dir() and markdown_count < limit:
            for md_path in folder_path.glob("*.md"):
                if md_path.is_file() and "report" in md_path.stem.lower():
                    markdown_count += 1
                    try:
                        retry_on_error()(extractor.analyze_feedstock_sustainability)(
                            audit_path=md_path
                        )
                    except Exception as e:
                        logger.error(f"Error processing {md_path}: {str(e)}")
                    time.sleep(1)  # Sleep for 1 second to avoid rate limiting

    logger.info(f"Successfully processed {markdown_count} markdown files")
    return 0


def populate_feedstock_evaluation_command(supabase_url, supabase_api_key, project_dir):
    if not supabase_url or not supabase_api_key:
        logger.error("Supabase URL and api key are required")
        return 1

    try:
        supabase_client = SupabaseClient.get_client(supabase_url, supabase_api_key)
        populate_feedstock_evaluation_table(project_dir, supabase_client)
        logger.info("Successfully populated feedstock evaluation table")
        return 0
    except Exception as e:
        logger.error(f"Error populating feedstock evaluation table: {str(e)}")
        return 1


def analyze_due_diligence_command(gemini_api_key, mistral_api_key, project_dir):
    """
    Process all PDFs in project subfolders and analyze due diligence criteria.

    Args:
        gemini_api_key: API key for PDF parsing
        mistral_api_key: API key for analysis
        project_dir: Root directory containing project folders
    """
    # max_num_folder = 60

    if not gemini_api_key or not mistral_api_key:
        logger.error("Both GEMINI_API_KEY and MISTRAL_API_KEY are required")
        return 1

    project_path = Path(project_dir)
    if not project_path.exists():
        logger.error(f"Directory not found at {project_path}")
        return 1

    pdf_parser = PDFToMarkdownParser(gemini_api_key=gemini_api_key)
    audit_extractor = AuditReportExtractor(mistral_api_key=mistral_api_key)

    # Count total subfolders (excluding hidden folders)
    total_folders = sum(
        1
        for folder in project_path.iterdir()
        if folder.is_dir() and not folder.name.startswith(".")
    )

    logger.info(f"Found {total_folders} project folders to process")

    processed_folder = 1
    for folder_path in project_path.iterdir():
        # if processed_folder >= max_num_folder:
        #     break

        if not folder_path.name.startswith(".") and folder_path.is_dir():
            project_name = "_".join(folder_path.name.split())
            logger.info(f"Analyzing project {processed_folder}/{total_folders}: {project_name} ...")

            # Parse PDFs in folder
            start_time = time.time()
            pdf_parser.parse_pdf_folder(folder_path, overwrite=False)
            logger.info(
                f"--------- PDF processing completed in {round((time.time() - start_time)/60, 2)} minutes ---------"
            )

            # Analyze due diligence
            markdown_document_path = os.path.join(
                folder_path, "parsed_markdown", f"concatenated_documentation_{pdf_parser.model}.md"
            )
            start_time = time.time()
            retry_on_error()(audit_extractor.analyze_due_diligence_criteria)(
                project_name=project_name, markdown_document_path=markdown_document_path
            )
            logger.info(
                f"--------- Due diligence analysis complete in {round((time.time() - start_time)/60, 2)} minutes."
            )
            processed_folder += 1

            time.sleep(1)

    logger.info("Completed processing all projects")
    return 0
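
Reviewer note: retry_on_error is applied at each call site above, e.g. retry_on_error()(extractor.parse_audit_report)(audit_report_path=md_path), which is equivalent to decorating the function with @retry_on_error() at definition time. A minimal sketch of the retry semantics, assuming a flaky callable (flaky_fetch is hypothetical):

import random

from sequestrae_engine.cli.commands import retry_on_error


def flaky_fetch():  # hypothetical stand-in for a rate-limited API call
    if random.random() < 0.5:
        raise RuntimeError("transient API error")
    return "ok"


# Retries up to 3 times with a 5-second pause between attempts;
# the exception from the final failed attempt is re-raised.
result = retry_on_error(max_retries=3, delay=5)(flaky_fetch)()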
104 changes: 104 additions & 0 deletions sequestrae_engine/cli/main.py
@@ -0,0 +1,104 @@
import argparse
import os
import sys

from . import commands


def main():
    parser = argparse.ArgumentParser(description="Sequestrae Engine CLI")
    subparsers = parser.add_subparsers(dest="command")

    # Parse PDFs command
    parse_pdfs_parser = subparsers.add_parser(
        "parse-pdf", help="Parse PDF audit report files to Markdown"
    )
    parse_pdfs_parser.add_argument("--llama-api-key", help="Llama API key", required=True)
    parse_pdfs_parser.add_argument("--project-dir", help="Project data directory", required=True)
    parse_pdfs_parser.add_argument("--limit", help="Maximum number of PDFs to process", type=int)
    parse_pdfs_parser.set_defaults(
        func=lambda args: commands.parse_pdfs_command(
            args.llama_api_key, args.project_dir, args.limit
        )
    )

    # Extract audit information command
    extract_audit_parser = subparsers.add_parser(
        "extract-audit", help="Extract audit information from markdown report files"
    )
    extract_audit_parser.add_argument("--mistral-api-key", help="Mistral API key", required=True)
    extract_audit_parser.add_argument("--project-dir", help="Project data directory", required=True)
    extract_audit_parser.add_argument(
        "--limit", help="Maximum number of files to process", type=int
    )
    extract_audit_parser.set_defaults(
        func=lambda args: commands.extract_audit_information_command(
            args.mistral_api_key, args.project_dir, args.limit
        )
    )

    # Evaluate feedstock sustainability command
    evaluate_feedstock_parser = subparsers.add_parser(
        "evaluate-feedstock", help="Evaluate feedstock sustainability from markdown report files"
    )
    evaluate_feedstock_parser.add_argument(
        "--mistral-api-key", help="Mistral API key", required=True
    )
    evaluate_feedstock_parser.add_argument(
        "--project-dir", help="Project data directory", required=True
    )
    evaluate_feedstock_parser.add_argument(
        "--limit", help="Maximum number of files to process", type=int
    )
    evaluate_feedstock_parser.set_defaults(
        func=lambda args: commands.evaluate_feedstock_sustainability_command(
            args.mistral_api_key, args.project_dir, args.limit
        )
    )

    # Populate feedstock evaluation table command
    populate_feedstock_parser = subparsers.add_parser(
        "populate-feedstock-table", help="Populate Supabase with feedstock evaluation data"
    )
    populate_feedstock_parser.add_argument("--supabase-url", help="Supabase URL", required=True)
    populate_feedstock_parser.add_argument("--supabase-api-key", help="Supabase key", required=True)
    populate_feedstock_parser.add_argument(
        "--project-dir", help="Project data directory", required=True
    )
    populate_feedstock_parser.set_defaults(
        func=lambda args: commands.populate_feedstock_evaluation_command(
            args.supabase_url, args.supabase_api_key, args.project_dir
        )
    )

    # Due diligence analyze command
    process_analyze_parser = subparsers.add_parser(
        "due-diligence-analyze",
        help="Process PDFs and analyze due diligence criteria for all projects",
    )
    process_analyze_parser.add_argument(
        "--gemini-api-key", help="Gemini API key for PDF parsing", required=True
    )
    process_analyze_parser.add_argument(
        "--mistral-api-key", help="Mistral API key for analysis", required=True
    )
    process_analyze_parser.add_argument(
        "--project-dir", help="Project data directory", required=True
    )
    process_analyze_parser.set_defaults(
        func=lambda args: commands.analyze_due_diligence_command(
            args.gemini_api_key, args.mistral_api_key, args.project_dir
        )
    )

    args = parser.parse_args()
    if hasattr(args, "func"):
        exit_status = args.func(args)
    else:
        parser.print_help()
        exit_status = 1
    return exit_status


if __name__ == "__main__":
    main()
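
Reviewer note: with the [tool.poetry.scripts] entry above, this entry point is exposed as the sequestrae console command. A minimal in-process invocation of the same entry point (the key and directory below are placeholders):

import sys

from sequestrae_engine.cli.main import main

# Equivalent to: sequestrae parse-pdf --llama-api-key <KEY> --project-dir project_data --limit 2
sys.argv = [
    "sequestrae",
    "parse-pdf",
    "--llama-api-key", "<LLAMA_API_KEY>",  # placeholder
    "--project-dir", "project_data",  # placeholder directory
    "--limit", "2",
]
sys.exit(main())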