app.py
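"""Interactive web scraper built with Streamlit and AutoScraper.

Give the app a URL plus a few example data points (the wanted_list);
AutoScraper learns matching rules, and the app shows the grouped results,
offers CSV/JSON downloads, and lets you save, list, and delete trained
scrapers on disk.
"""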
import os
import re

import pandas as pd
import streamlit as st
from autoscraper import AutoScraper
from streamlit_tags import st_tags_sidebar
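# Run locally with: streamlit run app.py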
# Set the page configuration
st.set_page_config(
    page_title="AutoScraper Streamlit App",
    layout="wide",
    initial_sidebar_state="expanded",
)
# App Title
st.title("AutoScraper Interactive Web Scraper")
# Create session state variables
if 'current_scraper' not in st.session_state:
    st.session_state.current_scraper = None
if 'current_rules' not in st.session_state:
    st.session_state.current_rules = None
if 'scraping_completed' not in st.session_state:
    st.session_state.scraping_completed = False
if 'structured_result' not in st.session_state:
    st.session_state.structured_result = None
# Sidebar sections
st.sidebar.header("🔧 Scraping Configuration")
# Function to validate URL
def is_valid_url(url):
    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http(s):// or ftp(s)://
        r'(?:\S+(?::\S*)?@)?'  # optional user:pass@
        r'(?:'  # IP address or domain name
        r'(?P<ip>(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])'
        r'(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){3})|'
        r'(?P<domain>'
        r'(?:[a-zA-Z0-9-]+\.)+'  # domain name
        r'[a-zA-Z]{2,}'
        r'))(?::\d{2,5})?'  # optional port
        r'(?:/\S*)?$'
    )
    return re.match(regex, url) is not None
# URL Input
url = st.sidebar.text_input("Enter the URL to scrape", "https://github.com/krishnaik06?tab=repositories")
# Validate URL
if not is_valid_url(url):
    st.sidebar.error("Please enter a valid URL.")
# Wanted List Input
st.sidebar.subheader("Example Data Points (wanted_list)")
wanted_list = st_tags_sidebar(
    label='Enter data points to scrape:',
    text='Press enter to add more',
    value=['Roadmap-To-Learn-Generative-AI-In-2024', '3,319'],
    suggestions=[],
    maxtags=-1,
    key='wanted_list',
)
# Scraping Section
st.sidebar.subheader("🚀 Run Scraper")
if st.sidebar.button("Start Scraping"):
    if not is_valid_url(url):
        st.error("Invalid URL. Please enter a valid URL to proceed.")
    elif not wanted_list:
        st.error("Please enter at least one data point in the wanted list.")
    else:
        with st.spinner("Building scraper and scraping data..."):
            try:
                scraper = AutoScraper()
                scraper.build(url, wanted_list)
                # Get structured results
                structured_result = scraper.get_result_similar(url, grouped=True)
                # Create rule aliases
                rules_dict = {}
                for i, (key, value) in enumerate(structured_result.items()):
                    alias = f"Column_{i+1}"
                    rules_dict[key] = alias
                # Store in session state
                st.session_state.current_scraper = scraper
                st.session_state.current_rules = rules_dict
                st.session_state.structured_result = structured_result
                st.session_state.scraping_completed = True
            except Exception as e:
                st.error(f"An error occurred during scraping: {e}")
# Separate Scraper Management Section
st.sidebar.divider()
st.sidebar.header("💾 Scraper Management")
# Save Scraper Section
save_col1, save_col2 = st.sidebar.columns([2, 1])
with save_col1:
    scraper_name = st.text_input("Scraper Name", "my-scraper")
with save_col2:
    if st.button("Save Scraper"):
        if not st.session_state.scraping_completed:
            st.sidebar.error("Please perform scraping first")
        elif not scraper_name:
            st.sidebar.error("Enter a name")
        else:
            try:
                st.session_state.current_scraper.set_rule_aliases(st.session_state.current_rules)
                st.session_state.current_scraper.save(scraper_name)
                st.sidebar.success(f"Saved as '{scraper_name}'")
            except Exception as e:
                st.sidebar.error(f"Error saving: {e}")
# Show Saved Scrapers Section
saved_scrapers = [f for f in os.listdir() if os.path.isfile(f) and
                  not os.path.splitext(f)[1] and  # no extension
                  not f.startswith('.') and  # not hidden files
                  f not in ['LICENSE', 'README', 'Dockerfile']]  # exclude common files
if saved_scrapers:
    st.sidebar.subheader("📁 Saved Scrapers")
    # Create a container for saved scrapers list
    saved_list = st.sidebar.container()
    with saved_list:
        for scraper in saved_scrapers:
            col1, col2 = st.columns([3, 1])
            with col1:
                st.text(scraper)
            with col2:
                if st.button("🗑️", key=f"delete_{scraper}"):
                    try:
                        os.remove(scraper)
                        st.rerun()
                    except Exception as e:
                        st.error(f"Error deleting {scraper}: {e}")
st.sidebar.divider()
# Display Results Section
if st.session_state.scraping_completed and st.session_state.structured_result:
    st.header("📊 Structured Results")
    # Display structured results
    with st.expander("📄 JSON RAW Results", expanded=False):
        for key, value in st.session_state.structured_result.items():
            st.markdown(f"**{key}:**")
            st.write(value)
    # Convert to DataFrame; wrap each column in a Series so rule groups of
    # different lengths are padded with NaN instead of raising an error
    df = pd.DataFrame({key: pd.Series(value) for key, value in st.session_state.structured_result.items()})
    st.dataframe(df)
    # Download Section
    st.subheader("💾 Download Scraped Data")
    csv = df.to_csv(index=False).encode('utf-8')
    json_data = df.to_json(orient='records')  # single JSON array, matching the .json name and MIME type
    col1, col2 = st.columns(2)
    with col1:
        st.download_button(
            label="Download as CSV",
            data=csv,
            file_name='scraped_data.csv',
            mime='text/csv',
        )
    with col2:
        st.download_button(
            label="Download as JSON",
            data=json_data,
            file_name='scraped_data.json',
            mime='application/json',
        )
# Footer
st.markdown("---")
st.markdown("Developed with β€οΈ using [Streamlit](https://streamlit.io/) and [AutoScraper](https://github.com/alirezamika/autoscraper).")