Skip to content

Commit

Permalink
implemented url screenshot extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
emcf committed Mar 24, 2024
1 parent 4a3dd8b commit 4b9028c
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 10 deletions.
Binary file modified __pycache__/extract.cpython-310.pyc
Binary file not shown.
25 changes: 16 additions & 9 deletions extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import json
import pytesseract
from unstructured.partition.auto import partition
from langchain_community.document_loaders import PlaywrightURLLoader
from playwright.sync_api import sync_playwright
import fitz
from core import Chunk, print_status, SourceTypes
import docx2txt
Expand Down Expand Up @@ -51,7 +51,7 @@ def extract_from_source(source_string: str, match: Optional[str] = None, ignore:
elif source_type == SourceTypes.ZIP:
return extract_zip(source_string=source_string, match=match, ignore=ignore, verbose=verbose, mathpix=mathpix, text_only=text_only)
elif source_type == SourceTypes.URL:
return [extract_url(source_string=source_string, text_only=text_only)]
return [extract_url(url=source_string, text_only=text_only)]
return extract_from_file(source_string=source_string, source_type=source_type, verbose=verbose, mathpix=mathpix, text_only=text_only)

def extract_from_file(source_string: str, source_type: str, verbose: bool = False, mathpix: bool = False, text_only: bool = False) -> List[str]:
Expand Down Expand Up @@ -217,13 +217,20 @@ def extract_spreadsheet(source_name: str) -> List[Chunk]:
return Chunk(path=source_name, text=json_dict, image=None, source_type=SourceTypes.SPREADSHEET)

def extract_url(url: str, text_only: bool = False) -> Chunk:
    """Extract the text (and optionally a screenshot) of a web page.

    Tries each Playwright browser engine in turn (chromium, firefox,
    webkit) and stops at the first one that succeeds, so a missing or
    broken engine does not abort the extraction.

    Args:
        url: The URL of the page to load.
        text_only: If True, skip the screenshot and extract text only.

    Returns:
        A Chunk holding the page's body text and, unless text_only is
        set, a PIL image of the page screenshot.

    Raises:
        Exception: If no engine managed to extract text or an image.
    """
    img = None
    text = None
    with sync_playwright() as p:
        for browser_type in (p.chromium, p.firefox, p.webkit):
            try:
                browser = browser_type.launch()
                try:
                    page = browser.new_page()
                    # Navigate to the requested URL (was hardcoded to a
                    # test site, so the argument was silently ignored).
                    page.goto(url)
                    if not text_only:
                        screenshot = page.screenshot()
                        img = Image.open(BytesIO(screenshot))
                    text = page.inner_text('body')
                finally:
                    # Always release the browser, even if goto/screenshot
                    # raised, to avoid leaking the launched process.
                    browser.close()
                break  # first successful engine wins; don't launch the rest
            except Exception:
                continue  # this engine failed; fall through to the next one
    if img is None and text is None:
        raise Exception("Failed to extract from URL.")
    return Chunk(path=url, text=text, image=img, source_type=SourceTypes.URL)

def extract_github(github_url: str, file_path: str = '', match: Optional[str] = None, ignore: Optional[str] = None, text_only: bool = False, mathpix: bool = False, branch: str = 'main', verbose: bool = False) -> List[Chunk]:
files_contents = []
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ playwright
docx2txt
python-pptx
Pyarrow
langchain-community
unstructured[all-docs]
poppler-utils
python-magic
Expand Down

0 comments on commit 4b9028c

Please sign in to comment.