-
Notifications
You must be signed in to change notification settings - Fork 0
/
Program.cs
227 lines (192 loc) · 9.15 KB
/
Program.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using CsvHelper;
using HtmlAgilityPack;
/// <summary>
/// Console tool that searches yellowpages.com for a term and location, collects the
/// company listings found on every result page and either prints them or exports
/// them to a CSV file.
/// </summary>
class Program
{
    /// <summary>Page size assumed by the paging arithmetic (total count / 30).</summary>
    private const int ResultsPerPage = 30;

    /// <summary>Placeholder stored for any field that could not be read from the page.</summary>
    private const string NotAvailable = "N/A";

    /// <summary>File name (without extension) used when the user does not supply one.</summary>
    private const string DefaultCsvFileName = "company_listings";

    private const string BaseUrl = "https://www.yellowpages.com";
    private const string MailtoPrefix = "mailto:";
    private const string ResultsPaneXPath = "//div[@class='scrollable-pane']";
    private const string ResultXPath = ".//div[contains(@class, 'result')]";

    private static readonly HttpClient _client = new HttpClient();
    private static readonly Uri _baseUri = new Uri(BaseUrl);

    // "of 245" / "of 1,234": the total is the number following "of".
    private static readonly Regex _totalCountRegex = new Regex(@"of\s+(\d[\d,]*)", RegexOptions.Compiled);

    // Listing headings carry a rank prefix such as "12. "; only that prefix is removed.
    private static readonly Regex _leadingRankRegex = new Regex(@"^\s*\d+\.\s*", RegexOptions.Compiled);

    private static List<CompanyListing> _companyListings = new List<CompanyListing>();
    private static bool _isExportEnabled = false;
    private static bool _isEmailEnabled = false;

    /// <summary>
    /// Entry point: prompts for the search parameters and options, runs the scrape and
    /// optionally exports the collected listings.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static async Task Main(string[] args)
    {
        string searchTerm = GetInput("Enter search Term:").Trim();
        string geoLocation = GetInput("Enter Location:").Trim();
        _isEmailEnabled = GetYesOrNoInput("Do you want to include Emails in list?");
        _isExportEnabled = GetYesOrNoInput("Do you want the list exported to a CSV?");

        string csvFileName = string.Empty;
        if (_isExportEnabled)
        {
            csvFileName = PromptForCsvPath();
        }

        if (string.IsNullOrWhiteSpace(searchTerm) || string.IsNullOrWhiteSpace(geoLocation))
        {
            Console.WriteLine("Can not search without search term and geo location");
            return;
        }

        // URL encoding of both values happens where the request URL is built.
        await StartScrape(searchTerm, geoLocation);

        if (_isExportEnabled)
        {
            try
            {
                ExportToCsv(csvFileName);
            }
            catch (Exception e) when (e is IOException || e is UnauthorizedAccessException)
            {
                Console.WriteLine("\nException Caught!");
                Console.WriteLine("Message :{0} ", e.Message);
            }
        }
    }

    /// <summary>
    /// Asks for the export directory and file name and combines them into the CSV path.
    /// A blank directory falls back to the user profile directory and a blank file name
    /// to <see cref="DefaultCsvFileName"/>, matching the defaults shown in the prompts.
    /// </summary>
    /// <returns>The path of the CSV file to write.</returns>
    private static string PromptForCsvPath()
    {
        string userDirectory = GetUserDirectory();
        string directoryInput = GetInput($"Export Directory: (default: {userDirectory}/)").Trim();
        string fileNameInput = GetInput($"Exported File Name: (default name: {DefaultCsvFileName})").Trim();

        string directory = directoryInput.Length == 0 ? userDirectory : directoryInput;
        string fileName = fileNameInput.Length == 0 ? DefaultCsvFileName : fileNameInput;

        // Avoid "name.csv.csv" when the user already typed the extension.
        if (!fileName.EndsWith(".csv", StringComparison.OrdinalIgnoreCase))
        {
            fileName += ".csv";
        }

        return Path.Combine(directory, fileName);
    }

    /// <summary>Writes a prompt and reads one line from the console.</summary>
    /// <param name="prompt">Text shown to the user.</param>
    /// <returns>The line entered, or an empty string when input has ended.</returns>
    static string GetInput(string prompt)
    {
        Console.WriteLine(prompt);
        return Console.ReadLine() ?? string.Empty;
    }

    /// <summary>Asks a yes/no question on the console.</summary>
    /// <param name="prompt">Question shown to the user; " (y/n)" is appended.</param>
    /// <returns><c>true</c> for "y" or "yes" (any casing); <c>false</c> for anything else.</returns>
    static bool GetYesOrNoInput(string prompt)
    {
        Console.WriteLine($"{prompt} (y/n)");
        string response = (Console.ReadLine() ?? string.Empty).Trim();
        return response.Equals("y", StringComparison.OrdinalIgnoreCase)
            || response.Equals("yes", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Scrapes every result page for the search, accumulating listings in the shared
    /// list, then sorts them by name and, when CSV export is disabled, prints them.
    /// </summary>
    /// <param name="searchTerm">Raw (unencoded) search term.</param>
    /// <param name="geoLocation">Raw (unencoded) location.</param>
    static async Task StartScrape(string searchTerm, string geoLocation)
    {
        try
        {
            Console.WriteLine("\nStarting\n-------------------------------\n");

            var firstPane = await LoadResultsPane(searchTerm, geoLocation, 1);
            if (firstPane == null)
            {
                // The page did not contain the results container at all.
                Console.WriteLine("No results found.");
                return;
            }

            if (!TryReadTotalCount(firstPane, out int postCount))
            {
                Console.WriteLine("Could not determine the total result count; only the first page will be searched.");
                postCount = 0;
            }

            // Ceiling division; page 1 is always searched, hence the minimum of 1.
            int pageCount = Math.Max(1, (int)Math.Ceiling(postCount / (double)ResultsPerPage));
            Console.WriteLine($"Total Count: {postCount}");
            Console.WriteLine($"Total Pages to Search: {pageCount}");
            Console.WriteLine("\n-------------------------------\n");

            await CreateCompanyListFromHtmlNode(firstPane.SelectNodes(ResultXPath));

            // Start at page 2 because page 1 has already been processed.
            for (int pageIndex = 2; pageIndex <= pageCount; pageIndex++)
            {
                Console.WriteLine($"Searching page: {pageIndex}");
                var pane = await LoadResultsPane(searchTerm, geoLocation, pageIndex);
                if (pane == null)
                {
                    Console.WriteLine("No results found.");
                    continue;
                }

                await CreateCompanyListFromHtmlNode(pane.SelectNodes(ResultXPath));
            }
        }
        catch (Exception e) when (e is HttpRequestException || e is TaskCanceledException)
        {
            // A failed or timed-out request stops paging; listings gathered so far are kept.
            Console.WriteLine("\nException Caught!");
            Console.WriteLine("Message :{0} ", e.Message);
        }

        // Sort once, after all pages, instead of re-sorting the whole list per page.
        _companyListings = _companyListings
            .Where(c => c.Name != NotAvailable)
            .OrderBy(c => c.Name)
            .ToList();

        if (!_isExportEnabled)
        {
            foreach (var item in _companyListings)
            {
                Console.WriteLine(item.ToString());
                Console.WriteLine("\n-------------------------------\n");
            }
        }
    }

    /// <summary>
    /// Downloads one search result page and returns its results container node.
    /// </summary>
    /// <returns>The "scrollable-pane" div, or <c>null</c> when the page has none.</returns>
    private static async Task<HtmlNode> LoadResultsPane(string searchTerm, string geoLocation, int pageIndex)
    {
        string responseBody = await GetWebpage(searchTerm, geoLocation, pageIndex);
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(responseBody);
        return htmlDoc.DocumentNode.SelectSingleNode(ResultsPaneXPath);
    }

    /// <summary>
    /// Reads the total number of results from the "showing-count" span of a results pane.
    /// </summary>
    /// <returns><c>true</c> when a count was found and parsed; otherwise <c>false</c>.</returns>
    private static bool TryReadTotalCount(HtmlNode resultsPane, out int totalCount)
    {
        var countNode = resultsPane.SelectSingleNode(".//span[contains(@class,'showing-count')]");
        string countText = ReadText(countNode);
        return int.TryParse(
            StripTotalCountText(countText),
            NumberStyles.None,
            CultureInfo.InvariantCulture,
            out totalCount);
    }

    /// <summary>Downloads a company detail page.</summary>
    /// <param name="route">Href taken from a listing; relative routes are resolved against the site root.</param>
    /// <returns>The response body.</returns>
    /// <exception cref="HttpRequestException">The request failed or returned a non-success status.</exception>
    /// <exception cref="UriFormatException">The route cannot be turned into a valid URI.</exception>
    static async Task<string> GetCompanyWebpage(String route)
    {
        // Uri resolution copes with "/path", "path" and absolute hrefs alike.
        var target = new Uri(_baseUri, route);
        using HttpResponseMessage response = await _client.GetAsync(target);
        response.EnsureSuccessStatusCode();
        return await response.Content.ReadAsStringAsync();
    }

    /// <summary>Downloads one page of search results.</summary>
    /// <param name="searchTerm">Raw search term; URL-encoded here.</param>
    /// <param name="geoLocation">Raw location; URL-encoded here.</param>
    /// <param name="pageIndex">1-based result page number.</param>
    /// <returns>The response body.</returns>
    /// <exception cref="HttpRequestException">The request failed or returned a non-success status.</exception>
    static async Task<string> GetWebpage(String searchTerm, String geoLocation, int pageIndex)
    {
        // Encode user input so spaces and reserved characters cannot break the query string.
        string url = BaseUrl
            + "/search?search_terms=" + Uri.EscapeDataString(searchTerm)
            + "&geo_location_terms=" + Uri.EscapeDataString(geoLocation)
            + "&page=" + pageIndex.ToString(CultureInfo.InvariantCulture);

        using HttpResponseMessage response = await _client.GetAsync(url);
        response.EnsureSuccessStatusCode();
        return await response.Content.ReadAsStringAsync();
    }

    /// <summary>
    /// Extracts the total from text of the form "... of 245" (thousands separators allowed).
    /// </summary>
    /// <param name="input">Text to search.</param>
    /// <returns>The digits of the number following "of", or an empty string when there is none.</returns>
    public static string StripTotalCountText(string input)
    {
        if (string.IsNullOrEmpty(input))
        {
            return string.Empty;
        }

        Match match = _totalCountRegex.Match(input);
        return match.Success ? match.Groups[1].Value.Replace(",", string.Empty) : string.Empty;
    }

    /// <summary>
    /// Converts each result node into a <c>CompanyListing</c> and appends it to the shared list.
    /// Nodes without a heading are skipped. When email lookup is enabled, the listing's
    /// detail page is fetched to find the address.
    /// </summary>
    /// <param name="companyListings">Result nodes of one page; <c>null</c> when the page had none.</param>
    static async Task CreateCompanyListFromHtmlNode(HtmlNodeCollection companyListings)
    {
        if (companyListings == null)
        {
            Console.WriteLine("No results found.");
            return;
        }

        foreach (var listingNode in companyListings)
        {
            var websiteNode = listingNode.SelectSingleNode(".//a[contains(@class, 'track-visit-website')]");
            var ypUrlNode = listingNode.SelectSingleNode(".//a[@href]");

            // Remove only the leading rank ("3. "), keeping digits and dots that belong to the name.
            string name = _leadingRankRegex
                .Replace(ReadText(listingNode.SelectSingleNode(".//h2")), string.Empty)
                .Trim();
            if (name == NotAvailable)
            {
                continue;
            }

            string number = ReadText(listingNode.SelectSingleNode(".//div[contains(@class, 'phone') and contains(@class, 'primary')]"));
            string address = ReadText(listingNode.SelectSingleNode(".//div[contains(@class, 'adr')]"));
            string website = websiteNode != null
                ? HtmlEntity.DeEntitize(websiteNode.GetAttributeValue("href", NotAvailable))
                : NotAvailable;
            string ypUrl = ypUrlNode != null
                ? ypUrlNode.GetAttributeValue("href", NotAvailable)
                : NotAvailable;

            string email = NotAvailable;
            if (_isEmailEnabled && ypUrl != NotAvailable)
            {
                email = await FetchCompanyEmail(ypUrl);
            }

            _companyListings.Add(new CompanyListing(name, number, email, address, website, ypUrl));
        }
    }

    /// <summary>Returns the entity-decoded, trimmed inner text of a node, or "N/A" when the node is missing.</summary>
    private static string ReadText(HtmlNode node)
    {
        return node != null ? HtmlEntity.DeEntitize(node.InnerText.Trim()) : NotAvailable;
    }

    /// <summary>
    /// Fetches a company's detail page and reads the address from its "email-business" link.
    /// This is best effort: any request failure is reported on the console and yields "N/A".
    /// </summary>
    /// <param name="route">Href of the company's detail page.</param>
    /// <returns>The address without the "mailto:" prefix, or "N/A" when unavailable.</returns>
    private static async Task<string> FetchCompanyEmail(string route)
    {
        try
        {
            string responseBody = await GetCompanyWebpage(route);
            var htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(responseBody);

            var emailNode = htmlDoc.DocumentNode.SelectSingleNode(".//a[@class='email-business']");
            if (emailNode == null)
            {
                return NotAvailable;
            }

            string mailto = emailNode.GetAttributeValue("href", NotAvailable);
            return mailto.StartsWith(MailtoPrefix, StringComparison.OrdinalIgnoreCase)
                ? mailto.Substring(MailtoPrefix.Length)
                : mailto;
        }
        catch (Exception e) when (e is HttpRequestException || e is UriFormatException || e is TaskCanceledException)
        {
            Console.WriteLine("\nException Caught!");
            Console.WriteLine("Message :{0} ", e.Message);
            return NotAvailable;
        }
    }

    /// <summary>Writes all collected listings to a CSV file, creating the target directory if needed.</summary>
    /// <param name="filePath">Path of the CSV file to create or overwrite.</param>
    /// <exception cref="IOException">The file or directory could not be written.</exception>
    /// <exception cref="UnauthorizedAccessException">The caller lacks permission for the path.</exception>
    public static void ExportToCsv(string filePath)
    {
        var directory = Path.GetDirectoryName(Path.GetFullPath(filePath));
        if (!string.IsNullOrEmpty(directory))
        {
            Directory.CreateDirectory(directory);
        }

        using var writer = new StreamWriter(filePath);
        using var csv = new CsvWriter(writer, CultureInfo.InvariantCulture);
        csv.WriteRecords(_companyListings);
    }

    /// <summary>
    /// Rounds <paramref name="value"/> up at the given number of decimal places and
    /// converts the result to <see cref="int"/>.
    /// </summary>
    /// <remarks>
    /// The final cast truncates any fractional digits, so with <paramref name="decimalPoint"/>
    /// equal to 0 this behaves as a ceiling. Kept for backward compatibility.
    /// </remarks>
    /// <param name="value">Value to round.</param>
    /// <param name="decimalPoint">Number of decimal places passed to <see cref="Math.Round(double, int)"/>.</param>
    /// <returns>The rounded value truncated to an integer.</returns>
    public static int RoundUp(double value, int decimalPoint)
    {
        double rounded = Math.Round(value, decimalPoint);
        if (rounded < value)
        {
            // Math.Round went down; step up by one unit of the last kept decimal place.
            rounded += Math.Pow(10, -decimalPoint);
        }

        return (int)rounded;
    }

    /// <summary>Returns the current user's profile directory.</summary>
    static string GetUserDirectory() => Environment.GetFolderPath(Environment.SpecialFolder.UserProfile);
}