新建爬虫类

把一些基础的信息初始化,方便在之后的方法中增加、删除、修改、使用这些数据。这些信息,在搜索课程和选课时都会用到。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
class LessonSpider:
    """Keep the login helper and the form state reused by search and selection."""

    def __init__(self, number, password, name):
        """Store the credentials, create the login helper and seed the form fields.

        Args:
            number: Student number; also passed to ``Lg.LoginSpider``.
            password: Account password; also passed to ``Lg.LoginSpider``.
            name: Student name, stored on the instance.
        """
        self.number, self.password, self.name = number, password, name
        self.login = Lg.LoginSpider(number, password)
        self.url = 'http://xk.zucc.edu.cn/'
        # Fields sent with every form POST. The pairs are listed in the same
        # order as before so the resulting dict iterates identically.
        form_fields = [
            ('__EVENTTARGET', ''),
            ('__EVENTARGUMENT', ''),
            ('__VIEWSTATE', ''),
            ('ddl_kcxz', ''),
            ('ddl_ywyl', ''),
            ('ddl_kcgs', ''),
            ('ddl_xqbs', '1'),
            ('ddl_sksj', ''),
            ('TextBox1', ''),
            ('dpkcmcGrid:txtChoosePage', '1'),
            ('dpkcmcGrid:txtPageSize', '200'),
        ]
        self.base_data = dict(form_fields)
        # Number of rows last seen in the "already selected" table.
        self.count_lesson = 0

获取选课信息

Chrome F12 进行逆向工程(俗称抓包),查看正常浏览(网上选课——公选课)时,会发出哪些数据
avatar
avatar
avatar

  • 请求的URL就是网址+‘xf_xsqxxxk.aspx’
  • 构造要发送的数据包中有学号,gb2312编码的姓名和gnmkdm
  • headers中需要增加Referer,就是当前页面的地址
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
def hello_zf(self):
    """Log in, open the elective page and record its state on the instance.

    Stores the page's view state via ``set_view_state`` and the value returned
    by ``already`` in ``self.count_lesson``. Returns nothing.
    """
    # Keep calling login_ocr() until it reports success.
    logged_in = self.login.login_ocr()
    while not logged_in:
        logged_in = self.login.login_ocr()

    query = {
        'xh': self.number,
        'xm': self.name.encode('gb2312'),
        'gnmkdm': 'N121103',
    }

    main_page = self.url + 'xs_main.aspx?xh=' + self.login.number
    self.login.headers['Referer'] = main_page
    elective_page = self.url + 'xf_xsqxxxk.aspx'
    response = self.login.s.get(elective_page, params=query, headers=self.login.headers)

    # Later POSTs are sent to (and refer to) the URL that was actually fetched.
    self.login.headers['Referer'] = response.url
    page = etree.HTML(response.text)
    self.set_view_state(page)
    self.count_lesson = already(page)

抓取已经选择的课程信息

用lxml解析get到的公选课课程页面,通过xpath定位到已经选择的课程列表。另外计数已经选择的课程,用于之后判断选课是否成功

1
2
3
4
5
6
7
8
9
def already(selector):
    """Print the rows of the ``DataGrid2`` table and return how many there are.

    Args:
        selector: Parsed page exposing ``xpath``.

    Returns:
        The number of ``tr`` elements found. Every row is counted, so a header
        row, if the table has one, is included in the total.
    """
    rows = selector.xpath('//table[@id="DataGrid2"]/tr')
    print('\n正方你好!已经选择的公选课为:')
    total = 0
    for total, row in enumerate(rows, start=1):
        # Cells 1, 7 and 8 of each row are shown as one combined list.
        cells = row.xpath('td[1]/text()')
        cells = cells + row.xpath('td[7]/text()')
        cells = cells + row.xpath('td[8]/text()')
        print(cells)
    return total

新建课程类

通过新建一个课程类,来方便管理获取到的课程信息

其中code被用作公选课选课时的提交代码,构造数据包时要用到

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
class Lesson:
    """Plain record describing one course row taken from the search results."""

    def __init__(self, num, nam, code, teacher_name, time, surplus):
        """Store the course fields.

        Args:
            num: Course number, printed by ``show``.
            nam: Course name.
            code: Form field name submitted when selecting this course.
            teacher_name: Teacher name.
            time: Schedule text.
            surplus: Remaining capacity; must be a string for ``show``.
        """
        self.number, self.name, self.code = num, nam, code
        self.teacher_name, self.time, self.surplus = teacher_name, time, surplus

    def show(self):
        """Print the course as a single tab-separated line."""
        labelled = (
            ('课程代码:', self.number),
            ('\t课程名:', self.name),
            ('\t教师:', self.teacher_name),
            ('\t时间:', self.time),
            ('\t余量:', self.surplus),
        )
        print(''.join(label + value for label, value in labelled))

搜索课程

发送查询请求

抓取了搜索课程的包,忽略值为空的键

  • __VIEWSTATE: 每提交一次请求,会get到一个新的值,并且在下一次请求中必须使用最新的值,定义一个函数用以更新这个值
1
2
3
def set_view_state(self, selector):
    """Copy the page's view-state value into ``self.base_data['__VIEWSTATE']``.

    Args:
        selector: Parsed page exposing ``xpath``.

    Raises:
        IndexError: If the XPath query matches nothing.
    """
    matches = selector.xpath('//*[@id="xsyxxxk_form"]/input[3]/@value')
    self.base_data['__VIEWSTATE'] = matches[0]
  • ddl_ywyl: ‘%D3%D0’ 中文“有”的gb2312编码,表示查询还有余量的公选课;‘%CE%DE’表示“无”;为空表示查询所有

  • ddl_xqbs: 1 相传这个是校区代码,每个学校都不一样。这里就保持1就OK

  • TextBox1: 要查找的课程名的GB2312编码

  • dpkcmcGrid:txtChoosePage: 1 表示选择的页数,默认1

  • dpkcmcGrid:txtPageSize: 200 为一页显示多少数据,经过测试,服务器最多响应200条,公选课也就一百多门,这里写200也就保证了上面那个值写1也能抓到所有课程。如果有一天,超过200门公选课了,就需要修改代码,循环抓取每一页的课程了

  • Button2: 确定的GB2312编码,相当于查询时的那个确定按钮

1
2
3
4
5
6
7
8
9
10
def search_lessons(self, lesson_name=""):
    """POST a course search and return the parsed result page.

    The search fields ``TextBox1`` and ``Button2`` are added to the shared
    ``self.base_data`` only for the duration of the request. They are removed
    again even when the request or the parsing fails, so a failed search can
    no longer leak the search button into later submissions.

    Args:
        lesson_name: Course name to search for; the empty string is sent as-is.

    Returns:
        The parsed page produced by ``etree.HTML``.
    """
    # Encode before touching shared state so an encoding error changes nothing.
    encoded_name = lesson_name.encode('gb2312')
    encoded_button = '确定'.encode('gb2312')
    self.base_data['TextBox1'] = encoded_name
    self.base_data['Button2'] = encoded_button
    try:
        response = self.login.s.post(
            self.login.headers['Referer'],
            data=self.base_data,
            headers=self.login.headers,
        )
        selector = etree.HTML(response.text)
        self.set_view_state(selector)
    finally:
        # Same end state as a successful search: both keys are gone.
        self.base_data.pop('Button2', None)
        self.base_data.pop('TextBox1', None)
    return selector

获取课程信息

lxml解析查询得到的页面,xpath定位所有课程信息的位置,转换为Lesson的对象存入列表中方便选课

1
2
3
4
5
6
7
8
9
10
11
12
13
def get_lessons(selector):
    """Build one ``Lesson.Lesson`` per data row of the ``kcmcGrid`` table.

    Args:
        selector: Parsed search-result page exposing ``xpath``.

    Returns:
        A list of ``Lesson.Lesson`` objects, in page order.

    Raises:
        IndexError: If a row lacks one of the expected cells.
    """
    def first(node, query):
        # Each query is expected to match at least once; take the first hit.
        return node.xpath(query)[0]

    lesson_list = []
    # Every row after the first one of the table is a course.
    for row in selector.xpath('//table[@id="kcmcGrid"]/tr[1]/following-sibling::tr'):
        checkbox_name = first(row, 'td[1]/input/@name')
        # Characters 52..80 of the onclick attribute are used as the course
        # number. NOTE(review): presumably the id embedded in the popup URL;
        # verify against the live page.
        course_number = first(row, 'td[2]/a/@onclick')[52:81]
        course_name = first(row, 'td[2]/a/text()')
        teacher = first(row, 'td[4]/a/text()')
        schedule = first(row, 'td[5]/@title')
        remaining = first(row, 'td[11]/text()')
        lesson_list.append(
            Lesson.Lesson(course_number, course_name, checkbox_name, teacher, schedule, remaining)
        )
    return lesson_list

进行公选课选课

这里post的数据包比起基础包要多两个
avatar

  • Button1: ‘ 提交 ‘的GB2312码(提交前后分别有两个空格)
  • 之前说的Lesson类中有一个code,就是这里需要提交的数据里面的键,对应的值是‘on’,相当于是勾选了每门课程前面的那个checkbox类型的input。审查元素的时候,可以在每门课程的input标签中name属性和id属性中获得
    avatar
1
2
3
4
5
6
7
8
9
10
11
12
13
14
def select_lesson(self, lesson_list):
    """Submit the selection form with the given lessons ticked.

    Updates ``self.base_data['__VIEWSTATE']`` and ``self.count_lesson`` from
    the response page and prints any alert message found on it.

    Args:
        lesson_list: Objects exposing ``code`` (checkbox field name) and ``name``.

    Returns:
        The number of distinct checkboxes submitted. A course listed twice is
        counted once, because it can only add one row to the selected table;
        counting duplicates would make a caller that waits for that many new
        rows wait forever.
    """
    data = copy.deepcopy(self.base_data)
    data['Button1'] = ' 提交 '.encode('gb2312')
    print("\n正在抢课:")
    # Keyed by checkbox name, so duplicates collapse into one entry.
    checkbox_fields = {}
    for lesson in lesson_list:
        checkbox_fields[lesson.code] = 'on'
        print(lesson.name)
    data.update(checkbox_fields)
    response = self.login.s.post(self.login.headers['Referer'], data=data, headers=self.login.headers)
    selector = etree.HTML(response.text)
    show_error(selector)
    self.set_view_state(selector)
    self.count_lesson = already(selector)
    return len(checkbox_fields)

这里还用到一个显示错误的函数,原理也是解析页面后查看html>head>script的值,用正则表达式查找是否有alert的弹窗,显示alert中的内容

1
2
3
4
5
6
7
def show_error(selector):
    """Print the message of every ``alert('...');`` call in the head scripts.

    Args:
        selector: Parsed page exposing ``xpath``.
    """
    # Raw string: the original non-raw literal relied on the invalid escape
    # sequences ``\(`` and ``\)``, which newer Python versions warn about.
    alert_pattern = re.compile(r"alert\('(.+?)'\);")
    for script_text in selector.xpath('/html/head/script/text()'):
        for message in alert_pattern.findall(script_text):
            print('\n' + message)

完整代码

Lesson.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import re


def show_error(selector):
    """Print each ``alert('...');`` message found in the page's head scripts.

    Args:
        selector: Parsed page exposing ``xpath``.
    """
    # The pattern is a raw string; the previous plain literal contained the
    # invalid escape sequences ``\(`` and ``\)``, which trigger warnings on
    # current Python versions.
    alert_pattern = re.compile(r"alert\('(.+?)'\);")
    script_texts = selector.xpath('/html/head/script/text()')
    for script_text in script_texts:
        if not script_text:
            continue
        for message in alert_pattern.findall(script_text):
            print('\n' + message)


class Lesson:
    """Record for one course row scraped from the search results."""

    def __init__(self, num, name, code, teacher_name, time, surplus):
        """Store the course fields.

        Args:
            num: Course number, printed by ``show``.
            name: Course name.
            code: Form field name submitted when selecting this course.
            teacher_name: Teacher name.
            time: Schedule text.
            surplus: Remaining capacity; must be a string for ``show``.
        """
        self.number, self.name, self.code = num, name, code
        self.teacher_name, self.time, self.surplus = teacher_name, time, surplus

    def show(self):
        """Print the course on one tab-separated line."""
        pieces = [
            '课程代码:', self.number,
            '\t课程名:', self.name,
            '\t教师:', self.teacher_name,
            '\t时间:', self.time,
            '\t余量:', self.surplus,
        ]
        print(''.join(pieces))

PublicCourseSpider.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import copy
import Login as Lg
from lxml import etree
import Lesson


def get_lessons(selector):
    """Turn every data row of the ``kcmcGrid`` table into a ``Lesson.Lesson``.

    Args:
        selector: Parsed search-result page exposing ``xpath``.

    Returns:
        A list of ``Lesson.Lesson`` objects, in page order.

    Raises:
        IndexError: If a row lacks one of the expected cells.
    """
    def first(node, query):
        # Take the first match; a query with no match raises IndexError.
        return node.xpath(query)[0]

    rows = selector.xpath('//table[@id="kcmcGrid"]/tr[1]/following-sibling::tr')
    lesson_list = []
    for row in rows:
        checkbox_name = first(row, 'td[1]/input/@name')
        # Slice 52:81 of the onclick attribute is used as the course number.
        # NOTE(review): presumably the id inside the popup URL; confirm
        # against the live page.
        course_number = first(row, 'td[2]/a/@onclick')[52:81]
        course_name = first(row, 'td[2]/a/text()')
        teacher = first(row, 'td[4]/a/text()')
        schedule = first(row, 'td[5]/@title')
        remaining = first(row, 'td[11]/text()')
        lesson_list.append(
            Lesson.Lesson(course_number, course_name, checkbox_name, teacher, schedule, remaining)
        )
    return lesson_list


def already(selector):
    """Print each row of the ``DataGrid2`` table and return the row count.

    Args:
        selector: Parsed page exposing ``xpath``.

    Returns:
        The number of ``tr`` elements found; a header row, if present, is
        counted as well.
    """
    print_header = '\n正方你好!已经选择的公选课为:'
    rows = selector.xpath('//table[@id="DataGrid2"]/tr')
    print(print_header)
    seen = 0
    for seen, row in enumerate(rows, start=1):
        # Show cells 1, 7 and 8 of the row as one combined list.
        print(row.xpath('td[1]/text()') + row.xpath('td[7]/text()') + row.xpath('td[8]/text()'))
    return seen


class PublicLessonSpider:
    """Interactive helper that searches public electives and submits selections.

    The instance keeps the login helper, the site URL, the form fields shared by
    every POST (``base_data``) and the last observed number of selected rows
    (``count_lesson``).
    """

    def __init__(self, stu_number, stu_password, stu_name):
        """Store the credentials, create the login helper and seed the form fields.

        Args:
            stu_number: Student number; also passed to ``Lg.LoginSpider``.
            stu_password: Account password; also passed to ``Lg.LoginSpider``.
            stu_name: Student name, sent GB2312-encoded by ``hello_zf``.
        """
        self.number = stu_number
        self.password = stu_password
        self.name = stu_name
        self.login = Lg.LoginSpider(stu_number, stu_password)
        self.url = 'http://xk.zucc.edu.cn/'
        # Fields sent with every form POST; '__VIEWSTATE' is refreshed after
        # each response by set_view_state().
        self.base_data = {
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': '',
            'ddl_kcxz': '',
            'ddl_ywyl': '',
            'ddl_kcgs': '',
            'ddl_xqbs': '1',
            'ddl_sksj': '',
            'TextBox1': '',
            'dpkcmcGrid:txtChoosePage': '1',
            'dpkcmcGrid:txtPageSize': '200',
        }
        self.count_lesson = 0

    def hello_zf(self):
        """Log in, open the elective page and return it parsed.

        Also stores the fetched URL as the ``Referer`` header and refreshes the
        cached view state.

        Returns:
            The parsed page produced by ``etree.HTML``.
        """
        # Keep retrying until login_ocr() reports success.
        while not self.login.login_ocr():
            continue

        data = {
            'xh': self.number,
            'xm': self.name.encode('gb2312'),
            'gnmkdm': 'N121103',
        }

        self.login.headers['Referer'] = self.url + 'xs_main.aspx?xh=' + self.login.number
        response = self.login.s.get(self.url + 'xf_xsqxxxk.aspx', params=data, headers=self.login.headers)

        # Later POSTs are sent to the URL that was actually fetched.
        self.login.headers['Referer'] = response.url
        selector = etree.HTML(response.text)
        self.set_view_state(selector)
        return selector

    def set_view_state(self, selector):
        """Copy the page's view-state value into ``base_data['__VIEWSTATE']``.

        Raises:
            IndexError: If the XPath query matches nothing.
        """
        view_state = selector.xpath('//*[@id="xsyxxxk_form"]/input[3]/@value')[0]
        self.base_data['__VIEWSTATE'] = view_state

    def search_lessons(self, lesson_name=""):
        """POST a course search and return the parsed result page.

        ``TextBox1`` and ``Button2`` are placed in ``base_data`` only while the
        request runs and are removed afterwards, including when the request or
        the parsing raises, so a failed search cannot leak the search button
        into a later submission.

        Args:
            lesson_name: Course name to search for; the empty string is sent as-is.

        Returns:
            The parsed page produced by ``etree.HTML``.
        """
        # Encode first so an encoding error leaves base_data untouched.
        encoded_name = lesson_name.encode('gb2312')
        encoded_button = '确定'.encode('gb2312')
        self.base_data['TextBox1'] = encoded_name
        self.base_data['Button2'] = encoded_button
        try:
            response = self.login.s.post(
                self.login.headers['Referer'],
                data=self.base_data,
                headers=self.login.headers,
            )
            selector = etree.HTML(response.text)
            self.set_view_state(selector)
        finally:
            # Same end state as a successful search: both keys are gone.
            self.base_data.pop('Button2', None)
            self.base_data.pop('TextBox1', None)
        return selector

    @staticmethod
    def _checkbox_fields(lesson_list):
        """Map each distinct ``lesson.code`` to ``'on'``, keeping first-seen order."""
        return {lesson.code: 'on' for lesson in lesson_list}

    def select_lesson(self, lesson_list):
        """Submit the selection form with the given lessons ticked.

        Updates the cached view state and ``count_lesson`` from the response
        and prints any alert message found on it.

        Args:
            lesson_list: Objects exposing ``code`` (checkbox field name) and ``name``.

        Returns:
            The number of distinct checkboxes submitted. A course listed twice
            counts once, since it can add only one row to the selected table.
        """
        data = copy.deepcopy(self.base_data)
        data['Button1'] = ' 提交 '.encode('gb2312')
        print("\n正在抢课:")
        for lesson in lesson_list:
            print(lesson.name)
        checkbox_fields = self._checkbox_fields(lesson_list)
        data.update(checkbox_fields)
        response = self.login.s.post(self.login.headers['Referer'], data=data, headers=self.login.headers)
        selector = etree.HTML(response.text)
        Lesson.show_error(selector)
        self.set_view_state(selector)
        self.count_lesson = already(selector)
        return len(checkbox_fields)

    def run(self):
        """Drive the interactive flow: log in, search, pick courses, submit.

        The final loop resubmits until ``count_lesson`` has grown by the number
        of distinct courses requested.
        """
        selector = self.hello_zf()
        self.count_lesson = already(selector)
        print('请输入课程名字进行搜索(准确查找|直接回车显示所有公选课)')
        lesson_name = input()
        lesson_list = get_lessons(self.search_lessons(lesson_name))
        select_list = []

        while True:
            for index, lesson in enumerate(lesson_list):
                print(index, end='\t')
                lesson.show()
            print('请输入想选的课的id,id为每门课程开头的数字,如果没有课程显示,代表暂时没有公选课,输入其他任意字符表示完成')
            select_id = input()
            # isdecimal() matches exactly what int() can parse; isdigit() also
            # accepts characters such as superscripts that make int() raise.
            if not select_id.isdecimal():
                break
            select_id = int(select_id)
            if not 0 <= select_id < len(lesson_list):
                break
            chosen = lesson_list[select_id]
            # Entering the same id twice must not queue the course twice.
            if chosen not in select_list:
                select_list.append(chosen)

        num = self.count_lesson
        while True:
            want = self.select_lesson(select_list)
            if self.count_lesson >= num + want:
                break
            print("\n抢课失败")

if __name__ == "__main__":
    # Script entry point: obtain the credentials from the login module, then
    # start the interactive selection flow.
    number, password, name = Lg.get_information()
    PublicLessonSpider(number, password, name).run()

完整项目包括登录、获取信息和自动选课