-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathSpider.class.php
129 lines (117 loc) · 3.98 KB
/
Spider.class.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
<?php
require_once('util4p/CRObject.class.php');
/**
 * Minimal cURL-based HTTP client.
 *
 * Request headers (User-Agent, Accept, Accept-Encoding, Accept-Language,
 * Cookie, Referer) and the timeout are set through configure() and applied
 * to every request. After a successful doGet()/doPost() the response is
 * available through getHeader(), getBody() and getStatusCode().
 */
class Spider
{
    private $userAgent = 'Spider';
    private $accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8';
    private $acceptEncoding = 'gzip, deflate, br';
    private $acceptLanguage = 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7';
    private $cookie = '';
    private $referer = '';
    private $timeout = 15;
    // Response headers of the final response, keyed by lower-cased header name.
    private $headers = array();
    private $body = '';
    // Result of curl_getinfo() for the last successful request.
    private $info = array();

    /**
     * Override the request defaults. Keys that are absent from $config keep
     * their current value.
     *
     * Recognised keys: 'User-Agent', 'Accept', 'Accept-Encoding',
     * 'Accept-Language', 'Cookie', 'Referer', 'timeout' (seconds).
     *
     * @param CRObject $config
     */
    public function configure(CRObject $config)
    {
        $this->userAgent = $config->get('User-Agent', $this->userAgent);
        $this->accept = $config->get('Accept', $this->accept);
        $this->acceptEncoding = $config->get('Accept-Encoding', $this->acceptEncoding);
        // Was reading 'Accept-Encoding' a second time, so the language could never be configured.
        $this->acceptLanguage = $config->get('Accept-Language', $this->acceptLanguage);
        $this->cookie = $config->get('Cookie', $this->cookie);
        $this->referer = $config->get('Referer', $this->referer);
        $this->timeout = $config->get('timeout', $this->timeout);
    }

    /**
     * Perform a GET request, following redirects.
     *
     * @param string $url
     * @return bool true on success, false if cURL reported a transfer error
     */
    public function doGet($url)
    {
        $ch = $this->createHandle($url);
        curl_setopt($ch, CURLOPT_HTTPGET, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        return $this->execute($ch);
    }

    /**
     * Perform a form-encoded POST request. Redirects are not followed.
     *
     * @param string $url
     * @param array $post_data array('key' => 'value'), encoded with http_build_query()
     * @return bool true on success, false if cURL reported a transfer error
     */
    public function doPost($url, $post_data)
    {
        $ch = $this->createHandle($url);
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($post_data));
        return $this->execute($ch);
    }

    /**
     * Look up a header of the last successful response.
     *
     * The lookup is case-insensitive. When the server sent the same header
     * more than once, the last occurrence is returned.
     *
     * @param string $key header name, e.g. 'Content-Type'
     * @return string|null the header value, or null when the header is absent
     */
    public function getHeader($key)
    {
        $name = strtolower($key);
        return isset($this->headers[$name]) ? $this->headers[$name] : null;
    }

    /**
     * @return string body of the last successful response ('' if there is none)
     */
    public function getBody()
    {
        return $this->body;
    }

    /**
     * @return int HTTP status code of the last successful request, 0 if there is none
     */
    public function getStatusCode()
    {
        return isset($this->info['http_code']) ? $this->info['http_code'] : 0;
    }

    /**
     * Create a cURL handle carrying the options shared by every request.
     *
     * @param string $url
     * @return resource|CurlHandle
     */
    private function createHandle($url)
    {
        $timeout = max(0, (int)$this->timeout);
        // Leave two seconds of the budget for the transfer itself, but never
        // pass a zero or negative connect timeout for small budgets.
        $connectTimeout = $timeout > 2 ? $timeout - 2 : $timeout;

        // An empty Expect header stops cURL from sending "Expect: 100-continue"
        // (avoids problems with overly long request data).
        $requestHeaders = array('Expect:');
        if ($this->accept !== '') {
            $requestHeaders[] = 'Accept: ' . $this->accept;
        }
        if ($this->acceptLanguage !== '') {
            $requestHeaders[] = 'Accept-Language: ' . $this->acceptLanguage;
        }

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_HEADER, true);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $requestHeaders);
        if ($this->userAgent !== '') {
            curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent);
        }
        if ($this->referer !== '') {
            curl_setopt($ch, CURLOPT_REFERER, $this->referer);
        }
        if ($this->cookie !== '') {
            curl_setopt($ch, CURLOPT_COOKIE, $this->cookie);
        }
        // CURLOPT_ENCODING both sends Accept-Encoding and makes cURL decode the
        // response, so getBody() never returns compressed bytes.
        $encodings = $this->decodableEncodings();
        if ($encodings !== '') {
            curl_setopt($ch, CURLOPT_ENCODING, $encodings);
        }
        return $ch;
    }

    /**
     * Reduce the configured Accept-Encoding list to the codings this cURL
     * build reports it can decode, so the server is never invited to answer
     * with a coding cURL would reject. Tokens carrying parameters (such as
     * q-values) and unknown codings are dropped.
     *
     * @return string comma-separated coding list, '' when nothing is usable
     */
    private function decodableEncodings()
    {
        $version = curl_version();
        $features = isset($version['features']) ? $version['features'] : 0;
        // Feature bit each coding needs; null marks a coding this PHP build cannot detect.
        $required = array(
            'identity' => 0,
            'gzip' => CURL_VERSION_LIBZ,
            'deflate' => CURL_VERSION_LIBZ,
            'br' => defined('CURL_VERSION_BROTLI') ? constant('CURL_VERSION_BROTLI') : null,
        );

        $usable = array();
        foreach (explode(',', $this->acceptEncoding) as $token) {
            $name = strtolower(trim($token));
            if (isset($required[$name]) && ($features & $required[$name]) === $required[$name]) {
                $usable[] = $name;
            }
        }
        return implode(', ', $usable);
    }

    /**
     * Run the prepared request and store the response.
     *
     * Any previously stored response is cleared first, so a failed request
     * never leaves stale headers, body or status behind.
     *
     * @param resource|CurlHandle $ch
     * @return bool true on success, false if cURL reported a transfer error
     */
    private function execute($ch)
    {
        $this->headers = array();
        $this->body = '';
        $this->info = array();

        $response = curl_exec($ch);
        if ($response === false) {
            return false;
        }

        $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        $this->headers = $this->parseHeaders(substr($response, 0, $headerSize));
        // substr() returns false instead of '' for an empty body on PHP < 8.
        $this->body = (string)substr($response, $headerSize);
        $this->info = curl_getinfo($ch);
        return true;
    }

    /**
     * Parse the raw header section returned by cURL.
     *
     * The section holds one header block per response (redirects, interim
     * 1xx responses, proxy CONNECT replies); only the last block, belonging
     * to the final response, is kept. The status line is skipped.
     *
     * @param string $rawHeader
     * @return array header values keyed by lower-cased header name
     */
    private function parseHeaders($rawHeader)
    {
        $blocks = preg_split('/\r?\n\r?\n/', trim((string)$rawHeader), -1, PREG_SPLIT_NO_EMPTY);
        if (empty($blocks)) {
            return array();
        }

        $lines = preg_split('/\r?\n/', end($blocks));
        array_shift($lines); // status line, e.g. "HTTP/1.1 200 OK"

        $parsed = array();
        foreach ($lines as $line) {
            // Limit 2 keeps colons inside the value; lines without a colon are ignored.
            $parts = explode(':', $line, 2);
            if (count($parts) !== 2) {
                continue;
            }
            $parsed[strtolower(trim($parts[0]))] = trim($parts[1]);
        }
        return $parsed;
    }
}