-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPageChecker.php
127 lines (109 loc) · 3.32 KB
/
PageChecker.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
<?php
use Sunra\PhpSimple\HtmlDomParser;
/**
 * Looks up where given domains appear in Google search results for given phrases.
 *
 * The input maps each domain to a list of phrases. For every phrase the first
 * PAGES_COUNT result pages are fetched and scanned for a result link whose host
 * (without a leading "www.") equals the domain. The first match is recorded in
 * $results and all matches are printed by output().
 */
class PageChecker
{
    /**
     * Google search URL
     */
    const SEARCH_URL = 'http://www.google.co.uk/search?hl=en&tbo=d&site=&source=hp&';

    /**
     * Set number of pages to scrape
     */
    const PAGES_COUNT = 5;

    /**
     * Step added to the "start" offset for each subsequent result page
     */
    const RESULTS_PER_PAGE = 10;

    /**
     * Domains to check, mapped to the phrases to search for: [domain => [phrase, ...]]
     *
     * @var array
     */
    private $data;

    /**
     * Results array: [domain => [phrase => ['pagerank' => int, 'link' => string]]]
     *
     * @var array
     */
    public $results = [];

    /**
     * PageChecker constructor.
     *
     * @param array $data domains mapped to lists of phrases
     */
    public function __construct(array $data)
    {
        $this->data = $data;
    }

    /**
     * Turn a human-readable phrase into the "+"-separated form used in the query.
     *
     * @param string $phrase
     * @return string
     */
    public function preparePhraseForGoogle($phrase)
    {
        return str_replace(' ', '+', $phrase);
    }

    /**
     * Build the full search URL for a prepared phrase and a result offset.
     *
     * @param string $phrase phrase as returned by preparePhraseForGoogle()
     * @param int    $pager  value of the "start" query parameter
     * @return string
     */
    public function prepareQueryStringForGoogle($phrase, $pager)
    {
        $queryData = [
            'q' => $phrase,
            'oq' => $phrase,
            'start' => $pager
        ];

        // The separator is passed explicitly so the result does not depend on
        // the arg_separator.output ini setting.
        $query = http_build_query($queryData, '', '&');

        // Only the "+" word separators are restored. Decoding the whole string
        // (as urldecode() would) also un-escapes "&", "#", "%" etc. coming from
        // the phrase and produces a broken URL.
        return self::SEARCH_URL . str_replace('%2B', '+', $query);
    }

    /**
     * Run the check for every domain/phrase pair, store the matches in $results
     * and print them.
     */
    public function init()
    {
        foreach ($this->data as $domain => $phrases) {
            foreach ($phrases as $phrase) {
                $googlePhrase = $this->preparePhraseForGoogle($phrase);
                $match = $this->findFirstMatch($domain, $googlePhrase);
                if ($match !== null) {
                    $this->results[$domain][$phrase] = $match;
                }
            }
        }

        $this->output($this->results);
    }

    /**
     * Scan the result pages for the first link that belongs to the domain.
     *
     * Scanning stops at the first match, so the reported position is the best
     * one and no further pages are requested once the domain has been found.
     *
     * @param string $domain       domain to look for (without "www.")
     * @param string $googlePhrase phrase as returned by preparePhraseForGoogle()
     * @return array|null ['pagerank' => int, 'link' => string] or null when not found
     */
    private function findFirstMatch($domain, $googlePhrase)
    {
        // Running counter of result links seen so far, across all pages.
        $position = 0;
        $offsetLimit = self::PAGES_COUNT * self::RESULTS_PER_PAGE;

        for ($offset = 0; $offset < $offsetLimit; $offset += self::RESULTS_PER_PAGE) {
            $url = $this->prepareQueryStringForGoogle($googlePhrase, $offset);
            $html = HtmlDomParser::file_get_html($url);
            if (!$html) {
                // Page could not be loaded; try the next one.
                continue;
            }

            foreach ($html->find('h3.r a') as $linkObj) {
                // Every result anchor counts towards the position, even when
                // its link turns out to be unusable below.
                $position++;

                $link = $this->extractResultLink($linkObj->href);
                if ($link === null) {
                    continue;
                }

                if ($this->parseUrl($link) === $domain) {
                    return ['pagerank' => $position, 'link' => $link];
                }
            }
        }

        return null;
    }

    /**
     * Get the target URL out of a result anchor's href.
     *
     * @param string $href raw href attribute value
     * @return string|null the link, or null when no http(s) URL can be obtained
     */
    private function extractResultLink($href)
    {
        $link = trim($href);

        // Direct link.
        if (preg_match('/^https?/', $link)) {
            return $link;
        }

        // Not a direct link, but a URL reference is found inside it: extract it.
        if (preg_match('/q=(.+)&sa=/U', $link, $matches) && preg_match('/^https?/', $matches[1])) {
            return $matches[1];
        }

        return null;
    }

    /**
     * Get domain name only from URL in Google and remove www
     *
     * @param string $url
     * @return string host without a leading "www.", or '' when the URL has no host
     */
    public function parseUrl($url)
    {
        $parts = parse_url($url);

        // parse_url() returns false for seriously malformed URLs and omits the
        // "host" key for relative ones.
        if (!is_array($parts) || !isset($parts['host'])) {
            return '';
        }

        return preg_replace('#^www\.(.+\.)#i', '$1', $parts['host']);
    }

    /**
     * Print results
     *
     * @param array $results same structure as $results property
     */
    public function output($results)
    {
        foreach ($results as $domain => $phrases) {
            foreach ($phrases as $phrase => $data) {
                echo "$domain: Found in position " . $data['pagerank'] . " for phrase '$phrase'" . PHP_EOL;
                echo "Search results link: " . $data['link'] . PHP_EOL . PHP_EOL;
            }
        }
    }
}