# streamlit_app.py
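"""Streamlit UI for trying several AI web-scraping back-ends side by side.

Launch locally with:

    streamlit run streamlit_app.py
"""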
import asyncio
import json
import os
from datetime import datetime

import streamlit as st

from basic_scraper import BasicScraper
from firecrawl_scraper import FireCrawlScraper
from jina_reader import JinaReader
from scrapegraph_ai import ScrapeGraphAI, GraphType

# Initialize session state for API keys
if 'api_keys' not in st.session_state:
    st.session_state.api_keys = {
        'jina': '',
        'firecrawl': '',
        'openai': ''
    }
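
# Note: st.session_state persists across Streamlit's top-to-bottom script
# reruns, so keys typed into the sidebar survive widget interactions.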

def init_page():
    st.set_page_config(
        page_title="AI Web Scraping Tools",
        page_icon="🌐",
        layout="wide",
        initial_sidebar_state="expanded"
    )
    st.title("🌐 AI Web Scraping Tools")
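
# Streamlit expects st.set_page_config() to be the first rendering call in
# the script, which is why main() invokes init_page() before anything else.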

def handle_api_keys():
    """Handle API keys in sidebar"""
    with st.sidebar:
        st.header("🔑 API Key Management")
        # Add expander for API key inputs
        with st.expander("Configure API Keys"):
            st.session_state.api_keys['jina'] = st.text_input(
                "Jina API Key",
                type="password",
                value=st.session_state.api_keys['jina']
            )
            st.session_state.api_keys['firecrawl'] = st.text_input(
                "FireCrawl API Key",
                type="password",
                value=st.session_state.api_keys['firecrawl']
            )
            st.session_state.api_keys['openai'] = st.text_input(
                "OpenAI API Key",
                type="password",
                value=st.session_state.api_keys['openai']
            )

def save_results(results, scraper_type):
    """Save results with timestamp"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"results_{scraper_type}_{timestamp}.json"
    # Create results directory if it doesn't exist
    os.makedirs("results", exist_ok=True)
    filepath = os.path.join("results", filename)
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=4)
    return filepath
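
# save_results() is defined but not wired into the UI. A minimal way to use
# it (sketch) would be inside display_results(), next to the download button:
#
#     filepath = save_results(results, scraper_type)
#     st.caption(f"Saved a copy to {filepath}")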

def display_results(results, scraper_type):
    """Display results and download button"""
    st.success("✅ Scraping completed successfully!")
    # Display results in tabs
    tab1, tab2 = st.tabs(["📊 Formatted Results", "📝 Raw JSON"])
    with tab1:
        if scraper_type == "Basic Scraper":
            # Display headings
            if "content" in results and "headings" in results["content"]:
                st.write("### 📑 Headings")
                for heading in results["content"]["headings"]:
                    st.write(f"{heading['type'].upper()}: {heading['text']}")
            # Display paragraphs
            if "content" in results and "paragraphs" in results["content"]:
                st.write("### 📝 Paragraphs")
                for idx, para in enumerate(results["content"]["paragraphs"], 1):
                    with st.expander(f"Paragraph {idx}"):
                        st.write(para)
            # Display links
            if "content" in results and "links" in results["content"]:
                st.write("### 🔗 Links")
                for link in results["content"]["links"]:
                    st.write(f"- [{link['text']}]({link['url']})")
        else:
            st.write("### Extracted Data")
            st.write(results)
    with tab2:
        st.json(results)
    # Create download button for JSON
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"scraped_data_{timestamp}.json"
    # Convert results to JSON string
    json_str = json.dumps(results, indent=2)
    # Create download button
    st.download_button(
        label="📥 Download JSON",
        data=json_str,
        file_name=filename,
        mime="application/json",
    )

def run_basic_scraper(url, article_selector):
    with st.spinner("🔍 Scraping webpage..."):
        try:
            scraper = BasicScraper()
            results = scraper.scrape_webpage(url, article_selector)
            if "error" in results:
                st.error(f"Scraping error: {results['error']}")
                return results
            if not any(results["content"].values()):
                st.warning("No content was extracted. Try adjusting the selector or check if the website allows scraping.")
            return results
        except Exception as e:
            st.error(f"Error during scraping: {str(e)}")
            return {"error": str(e), "success": False}

async def run_firecrawl(url, api_key):
    with st.spinner("🔄 Processing with FireCrawl..."):
        scraper = FireCrawlScraper(api_key)
        results = await scraper.scrape_url_async(url)
        status = await scraper.check_crawl_status(results['crawl_id'])
        return {"crawl_results": results, "status": status}

def run_jina_reader(url, api_key, action, max_length=None):
    with st.spinner("🤖 Processing with Jina AI..."):
        reader = JinaReader(api_key)
        if action == "Read URL":
            return reader.read_url(url)
        elif action == "Summarize":
            content = reader.read_url(url)
            if content.get("text"):
                return {"summary": reader.summarize(content["text"], max_length)}
            # Return an explicit error instead of falling through with None
            return {"error": "No text content could be read from the URL"}
        else:  # Extract Images
            return {"images": reader.extract_images(url)}

def run_scrapegraph(url, api_key, graph_type, prompt):
    with st.spinner("🎯 Processing with ScrapeGraphAI..."):
        config = {
            "llm": {
                "model": "gpt-3.5-turbo",
                "temperature": 0.7,
                "api_key": api_key
            }
        }
        scraper = ScrapeGraphAI(config)
        graph = scraper.create_graph(
            GraphType[graph_type],
            url=url,
            prompt=prompt
        )
        return scraper.execute_graph(graph)
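
# The LLM config above hardcodes gpt-3.5-turbo at temperature 0.7. For
# structured extraction a lower temperature is usually more deterministic,
# e.g. (sketch, same field names as above):
#
#     config = {"llm": {"model": "gpt-3.5-turbo", "temperature": 0.0,
#                       "api_key": api_key}}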

def validate_api_key(key_name: str) -> bool:
    """Validate if an API key is present"""
    if not st.session_state.api_keys.get(key_name):
        st.error(f"⚠️ {key_name.capitalize()} API Key is required")
        return False
    return True

def main():
    init_page()
    handle_api_keys()
    # Main content area
    col1, col2 = st.columns([2, 1])
    with col1:
        url = st.text_input("🔗 Enter URL to scrape", "https://example.com")
    with col2:
        scraper_type = st.selectbox(
            "🔧 Select Scraper",
            ["Basic Scraper", "FireCrawl Scraper", "Jina Reader", "ScrapeGraphAI"]
        )
    # Scraper-specific inputs and processing
    try:
        if scraper_type == "Basic Scraper":
            article_selector = st.text_input("Enter article selector (optional)", "article-title")
            if st.button("🚀 Start Scraping"):
                results = run_basic_scraper(url, article_selector)
                display_results(results, scraper_type)
        elif scraper_type == "FireCrawl Scraper":
            if st.button("🚀 Start Scraping") and validate_api_key('firecrawl'):
                results = asyncio.run(run_firecrawl(url, st.session_state.api_keys['firecrawl']))
                display_results(results, scraper_type)
        elif scraper_type == "Jina Reader":
            action = st.selectbox("Select Action", ["Read URL", "Summarize", "Extract Images"])
            max_length = None
            if action == "Summarize":
                max_length = st.slider("Summary Length", 100, 500, 200)
            if st.button("🚀 Start Processing") and validate_api_key('jina'):
                results = run_jina_reader(
                    url,
                    st.session_state.api_keys['jina'],
                    action,
                    max_length
                )
                display_results(results, scraper_type)
        elif scraper_type == "ScrapeGraphAI":
            graph_type = st.selectbox(
                "Select Graph Type",
                ["SMART_SCRAPER", "SEARCH", "OMNI_SCRAPER"]
            )
            prompt = st.text_area("Enter Prompt", "Extract product information")
            if st.button("🚀 Start Processing") and validate_api_key('openai'):
                results = run_scrapegraph(
                    url,
                    st.session_state.api_keys['openai'],
                    graph_type,
                    prompt
                )
                display_results(results, scraper_type)
    except ValueError as e:
        st.error(f"❌ Configuration Error: {str(e)}")
    except Exception as e:
        st.error(f"❌ An error occurred: {str(e)}")
        st.exception(e)

if __name__ == "__main__":
    main()