-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathExamPaperSpider.py
197 lines (172 loc) · 7.83 KB
/
ExamPaperSpider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
from typing import Any, List, Union
import requests
from HtmlParser import HtmlParser
from ExamPaper import ExamPaper, SingleQuestion, MultipleQuestion, JudgeQuestion, FillBlankQuestion
from time import sleep
from tqdm import tqdm
import pandas as pd
class Spider:
    """Minimal HTTP client that reuses one set of headers and cookies.

    The headers and cookies given at construction time are attached to every
    request issued through ``sendRequest`` (or by calling the instance).
    """

    url: str
    headers: Union[dict, None]
    cookies: Union[dict, None]
    params: Union[dict, None]
    method: str
    timeout: int

    def __init__(self,
                 headers: Union[dict, None] = None,
                 cookies: Union[dict, None] = None) -> None:
        """Remember the headers and cookies to send with each request."""
        self.cookies = cookies
        self.headers = headers

    def sendRequest(self,
                    url: str,
                    method: str = 'GET',
                    params: Union[dict, None] = None,
                    timeout: int = 10) -> str:
        """Issue one HTTP request and return the response body as text.

        Args:
            url: Address to request.
            method: HTTP verb handed to ``requests.request``.
            params: Optional query parameters.
            timeout: Timeout value forwarded to ``requests.request``.

        Returns:
            The ``text`` attribute of the response.
        """
        reply = requests.request(
            method=method,
            url=url,
            params=params,
            timeout=timeout,
            headers=self.headers,
            cookies=self.cookies,
        )
        body = reply.text
        return body

    def __call__(self,
                 url: str,
                 method: str = 'GET',
                 params: Union[dict, None] = None,
                 timeout: int = 10):
        """Shorthand for ``sendRequest`` with the same arguments."""
        return self.sendRequest(url, method, params, timeout)
class ExamPaperSpider(Spider):
    """Spider that downloads an exam paper from the chaoxing.com mobile API.

    Usage order: construct the spider, call ``setParams`` with the query
    parameters of one exam, then call ``getExamPaper``. ``getExamPaperInfos``
    reads ``self.infoParams`` and the ``get*Questions`` methods read
    ``self.detailParams`` and the id lists, so calling them before
    ``setParams`` / ``getExamPaperInfos`` raises ``AttributeError``.
    """

    def __init__(
        self,
        headers: dict,
        cookies: dict,
        timeSleep: float = 0.5,
    ) -> None:
        """Store credentials, endpoint URLs and the per-request delay.

        Args:
            headers: HTTP headers sent with every request.
            cookies: Cookies sent with every request.
            timeSleep: Value passed to ``time.sleep`` after each question
                request.
        """
        super().__init__(headers, cookies)
        self.examPaperUrl = "https://mooc1-api.chaoxing.com/exam-ans/exam/phone/examcode"
        self.lookDetailUrl = "https://mooc1-api.chaoxing.com/exam/phone/look-detail"
        self.lookInfoUrl = "https://mooc1-api.chaoxing.com/exam-ans/exam/phone/look-detail"
        self.timeSleep = timeSleep

    def setParams(self, parmas: dict) -> None:
        """Set the query parameters used for the info and detail requests.

        Two independent copies are kept so the caller's dict is never
        mutated; the detail copy additionally gets ``isDetail = 1``.

        Args:
            parmas: Base query parameters (the misspelled name is kept so
                existing keyword callers keep working).
        """
        self.infoParams = parmas.copy()
        self.detailParams = parmas.copy()
        self.detailParams["isDetail"] = 1

    def getExamPaperParams(self) -> List[Any]:
        """Collect the exam-paper parameters of every entry point.

        Requests ``examPaperUrl``, extracts the entry point URLs from the
        response, requests each of them and gathers what
        ``HtmlParser.getExamPaperParams`` returns for each page.

        Returns:
            One parsed parameter object per entry point, in page order.
        """
        response = self.sendRequest(self.examPaperUrl, 'GET')
        entryPoints = HtmlParser(response).getEntryPointsUrl()
        params = []
        for entryPoint in entryPoints:
            response = self.sendRequest(entryPoint, 'GET')
            parser = HtmlParser(response)
            params.append(parser.getExamPaperParams())
        return params

    def getExamPaperInfos(self) -> None:
        """Fetch the paper overview and store the question ids by type.

        Sets ``singleQuestionIds``, ``multipleQuestionIds``,
        ``judgeQuestionIds`` and ``fillQuestionIds`` on the instance.
        Also sets ``isDetail = 0`` in ``self.infoParams``.
        """
        self.infoParams["isDetail"] = 0
        response = self.sendRequest(self.lookInfoUrl,
                                    'GET',
                                    params=self.infoParams)
        parser = HtmlParser(response)
        self.singleQuestionIds = parser.getSingleSelectIds()
        self.multipleQuestionIds = parser.getMultiSelectIds()
        self.judgeQuestionIds = parser.getJudgeIds()
        self.fillQuestionIds = parser.getFillIds()

    def _fetchQuestions(self, questionIds, label: str, parseMethod: str,
                        pbar: tqdm) -> List[Any]:
        """Download and parse every question in ``questionIds``.

        Shared implementation behind the four public ``get*Questions``
        methods, which differ only in the id list, the progress label and
        the ``HtmlParser`` method used to parse the page.

        Args:
            questionIds: Ids of the questions to download.
            label: Question-type text inserted into the progress description.
            parseMethod: Name of the ``HtmlParser`` method that is called
                with the question id to build the question object.
            pbar: Progress bar advanced by one per question.

        Returns:
            The parsed questions, in the order of ``questionIds``.
        """
        questions: List[Any] = []
        params = self.detailParams.copy()
        for qid in questionIds:
            pbar.set_description(f"正在获取{label}: {qid}")
            params["questionLinkId"] = qid
            response = self.sendRequest(self.lookDetailUrl,
                                        'GET',
                                        params=params)
            parser = HtmlParser(response)
            questions.append(getattr(parser, parseMethod)(qid))
            # Pause between requests so the server is not hit too quickly.
            sleep(self.timeSleep)
            pbar.update(1)
        return questions

    def getSingleQuestions(self, pbar: tqdm) -> List[SingleQuestion]:
        """Download all single-choice questions listed in ``singleQuestionIds``."""
        return self._fetchQuestions(self.singleQuestionIds, "单选题",
                                    "parseSingleSelect", pbar)

    def getMultipleQuestions(self, pbar: tqdm) -> List[MultipleQuestion]:
        """Download all multiple-choice questions listed in ``multipleQuestionIds``."""
        return self._fetchQuestions(self.multipleQuestionIds, "多选题",
                                    "parseMultiSelect", pbar)

    def getJudgeQuestions(self, pbar: tqdm) -> List[JudgeQuestion]:
        """Download all true/false questions listed in ``judgeQuestionIds``."""
        return self._fetchQuestions(self.judgeQuestionIds, "判断题",
                                    "parseJudge", pbar)

    def getFillQuestions(self, pbar: tqdm) -> List[FillBlankQuestion]:
        """Download all fill-in-the-blank questions listed in ``fillQuestionIds``."""
        return self._fetchQuestions(self.fillQuestionIds, "填空题",
                                    "parseFill", pbar)

    def getExamPaper(self, examId: int) -> ExamPaper:
        """Download the complete exam paper for the current parameters.

        Refreshes the question id lists, then downloads the questions of
        each type while reporting progress on a single bar.

        Args:
            examId: Identifier passed to ``ExamPaper`` and shown in the
                progress description.

        Returns:
            An ``ExamPaper`` with its four question lists filled in.
        """
        self.getExamPaperInfos()
        total = (len(self.singleQuestionIds) +
                 len(self.multipleQuestionIds) +
                 len(self.judgeQuestionIds) +
                 len(self.fillQuestionIds))
        # The context manager closes the bar even if a request fails, so a
        # following paper's bar is not drawn over a stale one.
        with tqdm(total=total, desc=f"正在获取试卷: {examId}") as pbar:
            examPaper = ExamPaper(examId)
            examPaper.singleQuestionList = self.getSingleQuestions(pbar)
            examPaper.multiQuestionList = self.getMultipleQuestions(pbar)
            examPaper.judgeQuestionList = self.getJudgeQuestions(pbar)
            examPaper.fillBlankQuestionList = self.getFillQuestions(pbar)
        return examPaper
class TestExamPaperSpider:
    """Manual smoke-test harness for ``ExamPaperSpider``.

    The methods perform real network requests and print their results;
    ``testGetExamPaperInfos`` must run first because the other methods read
    the question id lists it populates on the spider.
    """

    def __init__(self, headers: dict, cookies: dict, parmas: dict) -> None:
        """Create the spider under test and configure its query parameters.

        Args:
            headers: HTTP headers for the spider.
            cookies: Cookies for the spider.
            parmas: Query parameters of the exam to fetch (the misspelled
                name is kept so existing keyword callers keep working).
        """
        self.spider = ExamPaperSpider(headers, cookies)
        # Forward the params to the spider: getExamPaperInfos reads
        # spider.infoParams, which only setParams assigns.
        self.spider.setParams(parmas)

    def testGetExamPaperInfos(self):
        """Fetch the paper overview and print the question ids per type."""
        self.spider.getExamPaperInfos()
        print(self.spider.singleQuestionIds)
        print(self.spider.multipleQuestionIds)
        print(self.spider.judgeQuestionIds)
        print(self.spider.fillQuestionIds)

    def testGetSingleQuestions(self):
        """Download the single-choice questions and dump them to single.csv."""
        with tqdm(total=len(self.spider.singleQuestionIds)) as pbar:
            singleSelectorList = self.spider.getSingleQuestions(pbar)
        # NOTE(review): `fatten` is presumably the project's spelling of
        # "flatten" on SingleQuestion - confirm against ExamPaper.py.
        data = [selector.fatten() for selector in singleSelectorList]
        print(data)
        df = pd.DataFrame(data)
        df.to_csv('single.csv', index=False, encoding='gbk')

    def testGetFillQuestions(self):
        """Download the fill-in-the-blank questions and print each one."""
        with tqdm(total=len(self.spider.fillQuestionIds)) as pbar:
            fillBlankSelectorList = self.spider.getFillQuestions(pbar)
        for selector in fillBlankSelectorList:
            print(selector)
if __name__ == '__main__':
    # Manual entry point: run the smoke tests against a live configuration.
    # Usage: python ExamPaperSpider.py [path/to/config.yaml]
    import sys

    from Config import Config

    # The original hard-coded location stays the default so running the
    # script without arguments behaves as before; an optional first
    # argument lets the script run on other machines.
    defaultConfigPath = (
        r'D:\Repositories\spider\chaoxing_mooc_spider\test\config.yaml')
    configPath = sys.argv[1] if len(sys.argv) > 1 else defaultConfigPath

    config = Config(configPath)
    testExamPaperSpider = TestExamPaperSpider(config.headers, config.cookies,
                                              config.params[0])
    testExamPaperSpider.testGetExamPaperInfos()
    testExamPaperSpider.testGetSingleQuestions()
    # testExamPaperSpider.testGetFillQuestions()