-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelpers.py
90 lines (82 loc) · 3.71 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# --- File: helpers.py ---
import codecs
import os
import pathlib
import traceback
# --- magic import ---
try:
import magic
MAGIC_AVAILABLE = True
except ImportError:
MAGIC_AVAILABLE = False
print("Warning: python-magic library not found or libmagic is missing.")
print("Install it ('pip install python-magic' or 'pip install python-magic-bin')")
print("and ensure libmagic C library is installed on your system.")
print("Falling back to content-based text file detection.")
# --- tiktoken import ---
try:
import tiktoken
TIKTOKEN_AVAILABLE = True
except ImportError:
TIKTOKEN_AVAILABLE = False
print("Warning: tiktoken library not found. Token counts will not be available.")
print("Install it using: pip install tiktoken")
# --- Configuration (needed for is_text_file and calculate_tokens) ---
# Number of leading bytes is_text_file reads from a file when it has to fall
# back to inspecting the content itself.
BINARY_CHECK_CHUNK_SIZE = 1024 # For is_text_file fallback check
# Default encoding name that calculate_tokens passes to tiktoken.get_encoding.
TOKEN_ENCODING_NAME = "cl100k_base"
# --- Helper Functions ---
# MIME types outside "text/*" that are still treated as text.
_TEXT_LIKE_MIME_TYPES = frozenset({
    "application/json",
    "application/xml",
    "application/javascript",
    "application/x-sh",
    "application/x-shellscript",
    "inode/x-empty",  # Empty files are ok
})


def is_text_file(file_path):
    """Return True if the file at *file_path* looks like (UTF-8) text.

    Detection happens in two stages:

    1. If python-magic is available, the file's MIME type is consulted.
       ``text/*`` and a small set of text-like types are accepted
       immediately; any type that is not ``application/*`` (images, audio,
       video, ...) is rejected immediately. Other ``application/*`` types
       are inconclusive and fall through to stage 2, as does any error
       raised by python-magic (a warning is printed in that case).
    2. The first ``BINARY_CHECK_CHUNK_SIZE`` bytes are read. An empty file
       counts as text, a NUL byte marks the file as binary, and otherwise
       the chunk must be valid UTF-8.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        True if the file is considered text, False otherwise. Files that
        cannot be opened or read are reported as False rather than raising.
    """
    if MAGIC_AVAILABLE:
        try:
            mime_type = magic.Magic(mime=True).from_file(file_path)
            # Common text types + JSON/XML often treated as text
            if mime_type.startswith("text/") or mime_type in _TEXT_LIKE_MIME_TYPES:
                return True
            # Anything that is not "application/*" is clearly not text.
            # "application/*" (including octet-stream) is too broad to decide
            # on the MIME type alone, so it is left to the content check.
            if not mime_type.startswith("application/"):
                return False
        except magic.MagicException as e:
            print(f"Warning: python-magic failed for {file_path}: {e}. Falling back.")
        except Exception as e:  # Catch other potential magic errors
            print(f"Warning: Unexpected error using python-magic for {file_path}: {e}. Falling back.")
        # Fall through to content check

    # Fallback: check content manually if magic unavailable or failed/inconclusive
    try:
        with open(file_path, 'rb') as f:
            chunk = f.read(BINARY_CHECK_CHUNK_SIZE)
        if not chunk:  # Empty file is considered text
            return True
        # Null bytes are a strong indicator of binary content
        if b'\0' in chunk:
            return False
        # A short read means the whole file was seen; a full-size read may
        # have cut a multi-byte UTF-8 character in half at the chunk boundary.
        saw_whole_file = len(chunk) < BINARY_CHECK_CHUNK_SIZE
        try:
            # The incremental decoder still rejects invalid bytes, but with
            # final=False it tolerates an incomplete trailing sequence, so a
            # valid UTF-8 file is not misclassified because of where the
            # chunk happened to end.
            codecs.getincrementaldecoder('utf-8')().decode(chunk, final=saw_whole_file)
            return True
        except UnicodeDecodeError:
            # If UTF-8 fails, it *might* still be text in another encoding
            # but for aggregating code, lack of UTF-8 is a reasonable filter.
            return False
    except IOError:  # Handle file not found or permission errors during fallback read
        return False
    except Exception as e:
        print(f"Unexpected error during fallback text check for {file_path}: {e}")
        return False
def calculate_tokens(text: str, encoding_name: str = TOKEN_ENCODING_NAME) -> int:
    """Count the tiktoken tokens in *text*.

    Args:
        text: The string to tokenize.
        encoding_name: Name of the tiktoken encoding to look up.

    Returns:
        The number of tokens, or 0 when tiktoken is not installed, *text* is
        empty, or tokenization fails (a warning is printed in that case).
    """
    if not (TIKTOKEN_AVAILABLE and text):
        return 0

    try:
        encoder = tiktoken.get_encoding(encoding_name)
        # disallowed_special=() stops tiktoken from raising when the text
        # happens to contain special-token strings; they are counted as
        # ordinary text instead.
        return len(encoder.encode(text, disallowed_special=()))
    except Exception as e:
        print(f"Warning: Could not calculate tokens using '{encoding_name}': {e}")
        return 0