Skip to content

Commit

Permalink
set default limit to None
Browse files Browse the repository at this point in the history
  • Loading branch information
emcf committed Apr 16, 2024
1 parent f4d7cd8 commit 0d6523a
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 7 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ thepipe path/to/folder

## How it works 🛠️

The pipe is accessible from the command line or from [Python](https://www.python.org/downloads/). The input source is either a file path, a URL, or a directory (or zip file) path. The pipe will extract information from the source and process it for downstream use with [language models](https://en.wikipedia.org/wiki/Large_language_model), [vision transformers](https://en.wikipedia.org/wiki/Vision_transformer), or [vision-language models](https://arxiv.org/abs/2304.00685). The output from the pipe is a sensible text-based (or multimodal) representation of the extracted information, carefully crafted to fit within the context windows of models ranging from [gemma-7b](https://huggingface.co/google/gemma-7b) to [GPT-4](https://openai.com/gpt-4). It uses a variety of heuristics for optimal performance with vision-language models, including AI-powered [filetype detection](https://opensource.googleblog.com/2024/02/magika-ai-powered-fast-and-efficient-file-type-identification.html), AI [PDF extraction](https://thepi.pe/docs), efficient [token compression](https://arxiv.org/abs/2403.12968), automatic [image encoding](https://en.wikipedia.org/wiki/Base64), [reranking](https://arxiv.org/abs/2310.06839) to mitigate [lost-in-the-middle](https://arxiv.org/abs/2307.03172) effects, and more, all pre-built to work out of the box.
The pipe is accessible from the command line or from [Python](https://www.python.org/downloads/). The input source is either a file path, a URL, or a directory. The pipe will extract information from the source and process it for downstream use with [language models](https://en.wikipedia.org/wiki/Large_language_model), [vision transformers](https://en.wikipedia.org/wiki/Vision_transformer), or [vision-language models](https://arxiv.org/abs/2304.00685). The output from the pipe is a sensible text-based (or multimodal) representation of the extracted information, carefully crafted to fit within the context windows of models ranging from [gemma-7b](https://huggingface.co/google/gemma-7b) to [GPT-4](https://openai.com/gpt-4). It uses a variety of heuristics for optimal performance with vision-language models, including AI-powered [filetype detection](https://opensource.googleblog.com/2024/02/magika-ai-powered-fast-and-efficient-file-type-identification.html), AI [PDF extraction](https://thepi.pe/docs), efficient [token compression](https://arxiv.org/abs/2403.12968), automatic [image encoding](https://en.wikipedia.org/wiki/Base64), [reranking](https://arxiv.org/abs/2310.06839) to mitigate [lost-in-the-middle](https://arxiv.org/abs/2307.03172) effects, and more, all pre-built to work out of the box.

## Supported File Types 📚

Expand Down
10 changes: 5 additions & 5 deletions thepipe_api/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@
GITHUB_TOKEN: str = os.getenv("GITHUB_TOKEN")
THEPIPE_API_KEY: str = os.getenv("THEPIPE_API_KEY")

def extract_from_source(source: str, match: Optional[str] = None, ignore: Optional[str] = None, limit: int = 64000, verbose: bool = False, ai_extraction: bool = False, text_only: bool = False, local: bool = True) -> List[Chunk]:
def extract_from_source(source: str, match: Optional[str] = None, ignore: Optional[str] = None, limit: int = None, verbose: bool = False, ai_extraction: bool = False, text_only: bool = False, local: bool = True) -> List[Chunk]:
source_type = detect_type(source)
if source_type is None:
return [Chunk(path=source)]
raise ValueError(f"Could not detect source type for {source}.")
if verbose: print_status(f"Extracting from {source_type.value}", status='info')
if source_type == SourceTypes.DIR or source == '.' or source == './':
if source == '.' or source == './':
source = os.getcwd()
return extract_from_directory(dir_path=source, match=match, ignore=ignore, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only)
return extract_from_directory(dir_path=source, match=match, ignore=ignore, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only, limit=limit)
elif source_type == SourceTypes.GITHUB:
return extract_github(github_url=source, file_path='', match=match, ignore=ignore, text_only=text_only, verbose=verbose, ai_extraction=ai_extraction, branch='master')
elif source_type == SourceTypes.URL:
Expand Down Expand Up @@ -149,13 +149,13 @@ def should_ignore(file_path: str, ignore: Optional[str] = None) -> bool:
return True
return False

def extract_from_directory(dir_path: str, match: Optional[str] = None, ignore: Optional[str] = None, verbose: bool = False, ai_extraction: bool = False, text_only: bool = False) -> List[Chunk]:
def extract_from_directory(dir_path: str, match: Optional[str] = None, ignore: Optional[str] = None, verbose: bool = False, ai_extraction: bool = False, text_only: bool = False, limit: int = None) -> List[Chunk]:
all_files = glob.glob(dir_path + "/**/*", recursive=True)
matched_files = [file for file in all_files if re.search(match, file, re.IGNORECASE)] if match else all_files
file_paths = [file for file in matched_files if not should_ignore(file, ignore)]
contents = []
with ThreadPoolExecutor() as executor:
results = executor.map(lambda file_path: extract_from_source(source=file_path, match=match, ignore=ignore, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only), file_paths)
results = executor.map(lambda file_path: extract_from_source(source=file_path, match=match, ignore=ignore, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only, limit=limit), file_paths)
for result in results:
contents += result
return contents
Expand Down
2 changes: 1 addition & 1 deletion thepipe_api/thepipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def parse_arguments() -> argparse.Namespace:
parser.add_argument('source', type=str, help='The source file or directory to compress.')
parser.add_argument('--match', type=str, default=None, help='The glob filename pattern to match in the directory. Glob notation, not regex. Only matches filenames, not paths.')
parser.add_argument('--ignore', type=str, default=None, help='The regex filepath pattern to ignore in the directory. Regex notation, not glob. Matches filenames and paths.')
parser.add_argument('--limit', type=float, default=1e5, help='The token limit for the compressed project context.')
parser.add_argument('--limit', type=int, default=None, help='The token limit for the compressed project context.')
parser.add_argument('--ai_extraction', action='store_true', help='Use ai_extraction to extract text from images.')
parser.add_argument('--text_only', action='store_true', help='Extract only text from the source.')
parser.add_argument('--quiet', action='store_true', help='Do not print status messages.')
Expand Down

0 comments on commit 0d6523a

Please sign in to comment.