-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathspidertest.prg
201 lines (131 loc) · 4.8 KB
/
spidertest.prg
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
*==============================================================================
* Purpose: SpiderTest.PRG
* Author: Bill Anderson
* Notice: Copyright (c) 2022 - 2025 The Anderson Files LLC, All Rights Reserved.
* Returns: Logical, indicating success.
* Date Added: 11/03/2022
*==============================================================================
# INCLUDE [Foxpro.H]
# DEFINE kcSpiderCache [C:\SpiderCache]
CLEAR ALL
CLOSE ALL
LOCAL loSpider AS [iSpider OF iChilkatVFP.VCX], loSeenDomains AS [iStringArray OF iChilkatVFP.VCX], ;
loSeedURLs AS [iStringArray OF iChilkatVFP.VCX], liCount AS Integer, lcURL AS Character, ;
lcDomain AS Character, li AS Integer, llSuccess AS Logical, ;
liOutboundLinks AS Integer, lcBaseDomain AS Character, llSpiderObject AS Logical, ;
llSeenDomainsObject AS Logical, llSeedURLsObject AS Logical
*!* LOCAL loChilkatVFPEventHandler && doesn't work!
PRIVATE loChilkatVFPEventHandler
STORE NULL TO loSpider, loSeenDomains, loSeedURLs, loChilkatVFPEventHandler
STORE [] TO lcURL, lcDomain, lcBaseDomain
STORE 0 TO liCount, li, liOutboundLinks
STORE .F. TO llSuccess, llSpiderObject, llSeenDomainsObject, llSeedURLsObject
** Alias not set test
IF NOT ([ChilkatVFP.VCX] $ UPPER(SET([CLASSLIB])))
SET CLASSLIB TO [ChilkatVFP.VCX] ADDITIVE
ENDIF NOT ([ChilkatVFP.VCX] $ UPPER(SET([CLASSLIB])))
** End alias not set test
** Alias not set test
IF NOT ([iChilkatVFP.VCX] $ UPPER(SET([CLASSLIB])))
SET CLASSLIB TO [iChilkatVFP.VCX] ADDITIVE
ENDIF NOT ([iChilkatVFP.VCX] $ UPPER(SET([CLASSLIB])))
** End alias not set test
CLEAR
**
** Go into the AbortCheck method of the BaseEventHandler.
** There is a line that begins with a question mark.
** Uncomment that line and save it.
**
loSpider = NEWOBJECT([Spider])
loSeenDomains = NEWOBJECT([StringArray])
loSeedURLs = NEWOBJECT([StringArray])
llSpiderObject = (TYPE([loSpider.Name]) == T_CHARACTER)
llSeenDomainsObject = (TYPE([loSeenDomains.Name]) == T_CHARACTER)
llSeedURLsObject = (TYPE([loSeedURLs.Name]) == T_CHARACTER)
llOKToContinue = (llSpiderObject AND llSeenDomainsObject AND llSeedURLsObject)
** Okay to continue test
IF llOKToContinue
WITH loSeedURLs
STORE .T. TO .lReturnBitAsLogical, .Unique
.Append([https://www.joelonsoftware.com/])
liCount = .Count
ENDWITH
WITH loSeenDomains
STORE .T. TO .lReturnBitAsLogical, .Unique
ENDWITH
WITH loSpider
STORE .T. TO .lReturnBitAsLogical, .FetchFromCache, .UpdateCache, .lAddEventHandler
.CacheDir = kcSpiderCache
** The line below is necessary for events to fire,
** even though it isn't referenced elsewhere in this program!
loChilkatVFPEventHandler = .oEventHandler
.HeartBeatMs = 500 && half second
** Walk through seed URLs loop
DO WHILE liCount > 0
lcURL = loSeedURLs.Pop()
.Initialize(lcURL)
lcDomain = .GetURLDomain(lcURL)
loSeenDomains.Append(.GetBaseDomain(lcDomain))
** Stop at five URLs loop
FOR li = 0 TO 4
llSuccess = .CrawlNext()
** Got to next link test
IF llSuccess
? .LastURL
** Not fetched from cache test
IF NOT .LastFromCache
.SleepMs(1000)
ENDIF NOT .LastFromCache
** End not fetched from cache test
ELSE
EXIT
ENDIF llSuccess
** End got to next link test
ENDFOR li = 0 TO 4
** End stop at five URLs loop
WITH loSeedURLs
.Remove(lcURL)
liCount = .Count
ENDWITH
ENDDO WHILE liCount > 0
** End walk through seed URLs loop
liOutboundLinks = .NumOutboundLinks - 1
** Walk through outbound links loop
FOR li = 0 TO liOutboundLinks
lcURL = .GetOutboundLink(li)
lcDomain = .GetURLDomain(lcURL)
lcBaseDomain = .GetBaseDomain(lcDomain)
** Already spidered test
IF NOT loSeenDomains.Contains(lcBaseDomain)
WITH loSeedURLs
** Add to list test
IF .Count < 1000
.Append(lcURL)
ENDIF .Count < 1000
** End add to list test
ENDWITH
ENDIF NOT loSeenDomains.Contains(lcBaseDomain)
** End already spidered test
ENDFOR li = 0 TO liOutboundLinks
** End walk through outbound links loop
ENDWITH
** End okay to continue test
ENDIF llOKToContinue
** End okay to continue test
** Have a Spider object test
IF llSpiderObject
loSpider.Release()
ENDIF llSpiderObject
** End have a Spider object test
** Have a Seen Domains object test
IF llSeenDomainsObject
loSeenDomains.Release()
ENDIF llSeenDomainsObject
** End have a Seen Domains object test
** Have a Seed URLs object test
IF llSeedURLsObject
loSeedURLs.Release()
ENDIF llSeedURLsObject
** End have a Seed URLs object test
STORE NULL TO loSpider, loSeenDomains, loSeedURLs
RETURN llSuccess