新建爬虫类

初始化基础信息

1
2
3
4
5
6
7
8
class PlannedCourseSpider:
    """Spider for the in-plan course pages of the course-selection site.

    Keeps the student's credentials, a ``Lg.LoginSpider`` helper, the base
    URL of the site and a ``__VIEWSTATE`` slot that starts out empty.
    """

    def __init__(self, stu_number, stu_password, stu_name):
        """Store the credentials and create the login helper.

        Args:
            stu_number: Student number.
            stu_password: Password, also handed to ``Lg.LoginSpider``.
            stu_name: Student name.
        """
        self.number, self.password, self.name = stu_number, stu_password, stu_name
        self.login = Lg.LoginSpider(stu_number, stu_password)
        self.url = 'http://xk.zucc.edu.cn/'
        # Filled in later from the hidden form field of a fetched page.
        self.__VIEWSTATE = ''

新建课程类

通过新建一个课程类,来方便管理获取到的课程信息

其中code被用作选课时的提交代码,构造数据包时要用到

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
class Lesson:
    """Plain record describing one course (or one offering of a course).

    Attributes:
        number: Value shown as the course code in ``show()``.
        name: Course name.
        code: Code submitted when the course is opened or selected.
        teacher_name: Teacher name.
        time: Class time.
        surplus: Remaining places.
    """

    def __init__(self, num, nam, code, teacher_name, time, surplus):
        """Store the given values on the instance unchanged."""
        self.number = num
        self.name = nam
        self.code = code
        self.teacher_name = teacher_name
        self.time = time
        self.surplus = surplus

    def show(self):
        """Print a one-line, tab-separated summary of the lesson."""
        # str.format converts every field itself, so a non-string value
        # (for example an int surplus) no longer raises TypeError the way
        # '+' concatenation did.
        print('课程代码:{}\t课程名:{}\t教师:{}\t时间:{}\t余量:{}'.format(
            self.number, self.name, self.teacher_name, self.time, self.surplus))

获取计划内课程

进入计划内课程页面

常规操作,逆向工程查看请求的URL和发送的数据

avatar
avatar

1
2
3
4
5
6
7
8
9
10
11
12
13
14
def hello_zf(self):
    """Open the in-plan course page and return it as a parsed tree.

    Sends a GET request to ``xsxk.aspx`` carrying the student number, the
    GB2312-encoded student name and the ``gnmkdm`` value ``N121103``, with
    the ``xs_main.aspx`` page as Referer. The stored Referer is then set to
    the URL of the response.

    Returns:
        The response text parsed with ``etree.HTML``.
    """
    query = {
        'xh': self.number,
        'xm': self.name.encode('gb2312'),
        'gnmkdm': 'N121103',
    }

    main_page = self.url + 'xs_main.aspx?xh=' + self.login.number
    self.login.headers['Referer'] = main_page

    target = self.url + 'xsxk.aspx'
    page = self.login.s.get(target, params=query, headers=self.login.headers)

    # The next request must name this page as its Referer.
    self.login.headers['Referer'] = page.url

    return etree.HTML(page.text)

解析获取所有课程

分析网页源码,课程信息的存放大同小异,都是在一张table里面,第一行是表头,获取表头行之后的所有兄弟节点。这里要注意这张网页的table最后有一个无效的tr标签,根据那个判断是否读完整张表。读取的信息可以存在之前用过的Lesson类里。这里用了正则表达式匹配课程代码,这个代码用于之后获取该课程的所有开课信息

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
def get_all_lesson(selector):
    """Collect the courses listed in the ``kcmcgrid`` table.

    Args:
        selector: Parsed page object exposing ``xpath``.

    Returns:
        A list of ``Lesson.Lesson`` objects. Teacher and time are set to the
        placeholder "待选择" because they are not read from this table.
    """
    # Compiled once instead of once per row. The capture group replaces the
    # original slice over the whole match.
    code_pattern = re.compile(r"xkkh(.+?)xh")

    lesson_list = []
    rows = selector.xpath('//table[@id="kcmcgrid"]/tr[1]/following-sibling::tr')
    for row in rows:
        num = row.xpath('td[1]/a/text()')
        if not num:
            # A row without a course link is the invalid trailing <tr>;
            # nothing useful follows it.
            break

        onclick = row.xpath('td[1]/a/@onclick')
        match = code_pattern.search(onclick[0]) if onclick else None
        if match is None:
            # Without a code the course cannot be opened later; skip the row
            # instead of crashing on ``None.group()``.
            continue
        # Drop one character on each side of the captured text. This gives
        # exactly what ``group()[5:-3]`` produced (presumably the characters
        # are '=' and '&' -- confirm against the page).
        code = match.group(1)[1:-1]

        name_nodes = row.xpath('td[2]/a/text()')
        surplus_nodes = row.xpath('td[10]/text()')
        # An empty cell yields no text node; fall back to an empty string.
        lesson_name = name_nodes[0] if name_nodes else ''
        surplus = surplus_nodes[0] if surplus_nodes else ''

        lesson_list.append(
            Lesson.Lesson(num[0], lesson_name, code, "待选择", "待选择", surplus))
    return lesson_list

显示获取到的所有计划内课程,选择查看详细课程信息

把所有获取到的课程显示出来,调用Lesson类的自带方法,选课也可以用到相同的方法,于是封装成一个函数

1
2
3
4
5
6
7
8
def show_and_select_lessons(lesson_list):
    """Print the lessons with their indices and ask the user to pick one.

    Args:
        lesson_list: Sequence of objects that provide a ``show()`` method.

    Returns:
        The chosen index, always within ``0 <= index < len(lesson_list)``.

    Raises:
        ValueError: If ``lesson_list`` is empty. No index could ever be
            valid, so prompting would never terminate.
    """
    for index, lesson in enumerate(lesson_list):
        print(index, end=' ')
        lesson.show()

    if not lesson_list:
        raise ValueError('lesson_list is empty: there is nothing to select')

    prompt = "请输入想选的课的id,id为每门课程开头的数字,如果没有课程显示,代表没有获取到计划内课程: "
    while True:
        try:
            select_id = int(input(prompt))
        except ValueError:
            # Non-numeric input is treated like any other invalid id.
            select_id = -1
        if 0 <= select_id < len(lesson_list):
            return select_id
        prompt = "错误的ID,请重新输入: "

获取课程的所有开课信息

根据选择的计划内课程,获取到那门课程的所有开课信息。

进入课程的信息页面

常规操作遇到坑,直接模拟获取信息的时候会在新窗口中打开。所以要按住Ctrl新建标签页面打开开课信息页面,然后在新打开的窗口中F12抓包。此时需要再发一次请求才能抓包,所以回到所有计划内课程页面,按住Ctrl再重新点击一次课程,之前打开抓包的那个页面就会刷新一次,抓到的数据也是需要的数据了

avatar

没啥新鲜的,Headers的referer改为发送请求的url,xkkh就是之前获取所有计划内课程信息里面的code,xh就是学号

1
2
3
4
5
6
7
8
9
10
11
12
13
def hello_lesson(self, xkkh):
    """Open the offerings page of one course and return it as a parsed tree.

    Args:
        xkkh: Course code sent as the ``xkkh`` query parameter.

    Returns:
        The response text parsed with ``etree.HTML``. As side effects the
        stored Referer becomes the URL of the response and the page's
        ``__VIEWSTATE`` is remembered via ``set_view_state``.
    """
    query = {
        'xh': self.number,
        'xkkh': xkkh,
    }
    target = self.url + 'xsxjs.aspx'
    page = self.login.s.get(target, params=query, headers=self.login.headers)

    # Every request uses the URL of the current page as its Referer.
    self.login.headers['Referer'] = page.url

    tree = etree.HTML(page.text)
    self.set_view_state(tree, 'xsxjs_form')
    return tree

这里用到了set_view_state来更新保存的__VIEWSTATE,分析源码发现,这个值总是出现在form表单的第三个隐藏input里面,后面选课的时候,要把这个值放进POST数据里一起发送

1
2
3
def set_view_state(self, selector, from_name):
    """Remember and return the ``__VIEWSTATE`` value of a form.

    Args:
        selector: Parsed page object exposing ``xpath``.
        from_name: ``id`` of the element whose third ``input`` child holds
            the value.

    Returns:
        The ``value`` attribute that was found.

    Raises:
        IndexError: If the XPath query matches nothing.
    """
    query = '//*[@id="' + from_name + '"]/input[3]/@value'
    matches = selector.xpath(query)
    self.__VIEWSTATE = matches[0]
    return self.__VIEWSTATE

获取该课程的所有信息

查看网页源码,跟之前完全类似的table结构,获取信息后储存到Lesson类中即可

1
2
3
4
5
6
7
8
9
10
11
def get_all_information_of_lesson(selector, lesson):
    """Collect every offering of one course from the ``xjs_table`` table.

    Args:
        selector: Parsed page object exposing ``xpath``.
        lesson: The course whose offerings are listed; its ``name`` and
            ``code`` are copied into every returned object.

    Returns:
        A list of ``Lesson.Lesson`` objects, one per table row that has a
        teacher link. ``number`` holds the value of the row's input in
        column 16 and ``surplus`` is column 12 minus column 15 as a string
        (presumably capacity minus already selected -- confirm against the
        page).
    """
    lesson_list = []
    rows = selector.xpath('//table[@id="xjs_table"]/tr[1]/following-sibling::tr')
    for row in rows:
        teacher = row.xpath('td[2]/a/text()')
        if not teacher:
            # Not a data row (no teacher link); skip it instead of raising
            # IndexError.
            continue
        time_text = row.xpath('td[6]/text()')[0]
        surplus = int(row.xpath('td[12]/text()')[0]) - int(row.xpath('td[15]/text()')[0])
        code = row.xpath('td[16]/input/@value')[0]
        # Use a separate name: the original rebound the ``lesson`` parameter
        # here, so later rows read name/code from the previous result.
        offering = Lesson.Lesson(code, lesson.name, lesson.code, teacher[0], time_text, str(surplus))
        lesson_list.append(offering)
    return lesson_list

选课

手动模拟选课,查看POST的数据,比公选课简单,数据也少一点

  • ‘__EVENTTARGET’: ‘Button1’ 就是值为“选定”的那个button
  • ‘__VIEWSTATE’: 前面获取到的那个东西,每发送一次请求,就会更新一次值
  • ‘xkkh’: 这是课程里具体选择的那个老师的课的课程号,在前面获取所有信息的时候,已经储存在Lesson类的code信息里了

headers的referer还是要更新为当前发送请求的页面URL

1
2
3
4
5
6
7
8
9
10
def select_lesson(self, lesson):
    """Submit the selection form for one offering.

    POSTs to the URL stored as the current Referer, prints any alert found
    in the reply via ``Lesson.show_error`` and remembers the reply's
    ``__VIEWSTATE`` for the next submission.

    Args:
        lesson: Object whose ``code`` is sent as the ``xkkh`` form field.
    """
    form = {
        '__EVENTTARGET': 'Button1',
        '__VIEWSTATE': self.__VIEWSTATE,
        'xkkh': lesson.code
    }
    target = self.login.headers['Referer']
    reply = self.login.s.post(target, data=form, headers=self.login.headers)

    tree = etree.HTML(reply.text)
    Lesson.show_error(tree)
    self.set_view_state(tree, 'xsxjs_form')

用到的show_error是Lesson模块的方法,用XPath定位页面的script,再用正则表达式匹配alert信息,给出“现在不是选课时间”、“选课时间冲突”这样的提示

1
2
3
4
5
6
7
def show_error(selector):
    """Print every ``alert('...')`` message found in the page's head scripts.

    Args:
        selector: Parsed page object exposing ``xpath``.
    """
    # Raw string: the original non-raw literal contained the invalid escape
    # sequences ``\(`` and ``\)``, which newer Python versions warn about.
    alert_pattern = re.compile(r"alert\('(.+?)'\);")
    for script_text in selector.xpath('/html/head/script/text()'):
        # An empty script yields no matches, so no separate emptiness check
        # is needed.
        for message in alert_pattern.findall(script_text):
            print('\n' + message)

完整代码

PlannedCourseSpider.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import Login as Lg
from lxml import etree
import Lesson
import re


def get_all_lesson(selector):
    """Collect the courses listed in the ``kcmcgrid`` table.

    Args:
        selector: Parsed page object exposing ``xpath``.

    Returns:
        A list of ``Lesson.Lesson`` objects. Teacher and time are set to the
        placeholder "待选择" because they are not read from this table.
    """
    # Compiled once instead of once per row. The capture group replaces the
    # original slice over the whole match.
    code_pattern = re.compile(r"xkkh(.+?)xh")

    lesson_list = []
    rows = selector.xpath('//table[@id="kcmcgrid"]/tr[1]/following-sibling::tr')
    for row in rows:
        num = row.xpath('td[1]/a/text()')
        if not num:
            # A row without a course link is the invalid trailing <tr>;
            # nothing useful follows it.
            break

        onclick = row.xpath('td[1]/a/@onclick')
        match = code_pattern.search(onclick[0]) if onclick else None
        if match is None:
            # Without a code the course cannot be opened later; skip the row
            # instead of crashing on ``None.group()``.
            continue
        # Drop one character on each side of the captured text. This gives
        # exactly what ``group()[5:-3]`` produced (presumably the characters
        # are '=' and '&' -- confirm against the page).
        code = match.group(1)[1:-1]

        name_nodes = row.xpath('td[2]/a/text()')
        surplus_nodes = row.xpath('td[10]/text()')
        # An empty cell yields no text node; fall back to an empty string.
        lesson_name = name_nodes[0] if name_nodes else ''
        surplus = surplus_nodes[0] if surplus_nodes else ''

        lesson_list.append(
            Lesson.Lesson(num[0], lesson_name, code, "待选择", "待选择", surplus))
    return lesson_list


def get_all_information_of_lesson(selector, lesson):
    """Collect every offering of one course from the ``xjs_table`` table.

    Args:
        selector: Parsed page object exposing ``xpath``.
        lesson: The course whose offerings are listed; its ``name`` and
            ``code`` are copied into every returned object.

    Returns:
        A list of ``Lesson.Lesson`` objects, one per table row that has a
        teacher link. ``number`` holds the value of the row's input in
        column 16 and ``surplus`` is column 12 minus column 15 as a string
        (presumably capacity minus already selected -- confirm against the
        page).
    """
    lesson_list = []
    rows = selector.xpath('//table[@id="xjs_table"]/tr[1]/following-sibling::tr')
    for row in rows:
        teacher = row.xpath('td[2]/a/text()')
        if not teacher:
            # Not a data row (no teacher link); skip it instead of raising
            # IndexError.
            continue
        time_text = row.xpath('td[6]/text()')[0]
        surplus = int(row.xpath('td[12]/text()')[0]) - int(row.xpath('td[15]/text()')[0])
        code = row.xpath('td[16]/input/@value')[0]
        # Use a separate name: the original rebound the ``lesson`` parameter
        # here, so later rows read name/code from the previous result.
        offering = Lesson.Lesson(code, lesson.name, lesson.code, teacher[0], time_text, str(surplus))
        lesson_list.append(offering)
    return lesson_list


def show_and_select_lessons(lesson_list):
    """Print the lessons with their indices and ask the user to pick one.

    Args:
        lesson_list: Sequence of objects that provide a ``show()`` method.

    Returns:
        The chosen index, always within ``0 <= index < len(lesson_list)``.

    Raises:
        ValueError: If ``lesson_list`` is empty. No index could ever be
            valid, so prompting would never terminate.
    """
    for index, lesson in enumerate(lesson_list):
        print(index, end=' ')
        lesson.show()

    if not lesson_list:
        raise ValueError('lesson_list is empty: there is nothing to select')

    prompt = "请输入想选的课的id,id为每门课程开头的数字,如果没有课程显示,代表没有获取到计划内课程: "
    while True:
        try:
            select_id = int(input(prompt))
        except ValueError:
            # Non-numeric input is treated like any other invalid id.
            select_id = -1
        if 0 <= select_id < len(lesson_list):
            return select_id
        prompt = "错误的ID,请重新输入: "


class PlannedCourseSpider:
    """Drive the in-plan course selection flow of the course-selection site.

    The spider logs in through ``Lg.LoginSpider``, lists the in-plan courses,
    opens the offerings of one course and then keeps submitting the
    selection form for the chosen offering.
    """

    def __init__(self, stu_number, stu_password, stu_name):
        """Store the credentials and create the login helper.

        Args:
            stu_number: Student number, sent as the ``xh`` request parameter.
            stu_password: Password, also handed to ``Lg.LoginSpider``.
            stu_name: Student name, sent GB2312-encoded as ``xm``.
        """
        self.number = stu_number
        self.password = stu_password
        self.name = stu_name
        self.login = Lg.LoginSpider(stu_number, stu_password)
        self.url = 'http://xk.zucc.edu.cn/'
        # Last __VIEWSTATE read from a page; sent back with the next POST.
        self.__VIEWSTATE = ''

    def hello_zf(self):
        """Log in, open the in-plan course page and return it parsed.

        Returns:
            The response text parsed with ``etree.HTML``. As a side effect
            the stored Referer becomes the URL of the response.
        """
        # Retry until login_ocr() reports success.
        # NOTE(review): there is no retry limit, so this spins forever if the
        # login can never succeed -- confirm that this is intended.
        while not self.login.login_ocr():
            continue

        data = {
            'xh': self.number,
            'xm': self.name.encode('gb2312'),
            'gnmkdm': 'N121103',
        }

        self.login.headers['Referer'] = self.url + 'xs_main.aspx?xh=' + self.login.number
        response = self.login.s.get(self.url + 'xsxk.aspx', params=data, headers=self.login.headers)

        # The next request must name this page as its Referer.
        self.login.headers['Referer'] = response.url
        return etree.HTML(response.text)

    def hello_lesson(self, xkkh):
        """Open the offerings page of one course and return it parsed.

        Args:
            xkkh: Course code sent as the ``xkkh`` query parameter.

        Returns:
            The response text parsed with ``etree.HTML``. The stored Referer
            and ``__VIEWSTATE`` are updated from the response.
        """
        data = {
            'xh': self.number,
            'xkkh': xkkh,
        }

        response = self.login.s.get(self.url + 'xsxjs.aspx', params=data, headers=self.login.headers)

        self.login.headers['Referer'] = response.url
        selector = etree.HTML(response.text)
        self.set_view_state(selector, 'xsxjs_form')

        return selector

    def select_lesson(self, lesson):
        """Submit the selection form for one offering.

        POSTs to the URL stored as the current Referer, prints any alert
        found in the reply and remembers the reply's ``__VIEWSTATE``.

        Args:
            lesson: Object whose ``code`` is sent as the ``xkkh`` form field.
        """
        data = {
            '__EVENTTARGET': 'Button1',
            '__VIEWSTATE': self.__VIEWSTATE,
            'xkkh': lesson.code
        }
        response = self.login.s.post(self.login.headers['Referer'], data=data, headers=self.login.headers)
        selector = etree.HTML(response.text)
        Lesson.show_error(selector)
        self.set_view_state(selector, 'xsxjs_form')

    def set_view_state(self, selector, from_name):
        """Remember and return the ``__VIEWSTATE`` value of a form.

        Args:
            selector: Parsed page object exposing ``xpath``.
            from_name: ``id`` of the element whose third ``input`` child
                holds the value.

        Returns:
            The ``value`` attribute that was found.

        Raises:
            IndexError: If the XPath query matches nothing.
        """
        self.__VIEWSTATE = selector.xpath('//*[@id="' + from_name + '"]/input[3]/@value')[0]
        return self.__VIEWSTATE

    def run(self, retry_interval=0.0):
        """Run the interactive flow, then keep submitting the chosen offering.

        The user first picks a course and then one of its offerings. After
        that the selection form is submitted in an endless loop; the method
        only ends when an exception is raised or the process is interrupted.

        Args:
            retry_interval: Seconds to wait between two submissions. The
                default of 0 keeps the original back-to-back behaviour; pass
                a positive value to avoid flooding the server.
        """
        import time  # local import: only this method needs it

        selector = self.hello_zf()
        lesson_list = get_all_lesson(selector)
        select_id = show_and_select_lessons(lesson_list)
        selector = self.hello_lesson(lesson_list[select_id].code)
        information_list = get_all_information_of_lesson(selector, lesson_list[select_id])
        choose_id = show_and_select_lessons(information_list)
        while True:
            self.select_lesson(information_list[choose_id])
            if retry_interval > 0:
                time.sleep(retry_interval)


if __name__ == "__main__":
    # Script entry point: read the credentials through the login module,
    # then run the spider with them.
    stu_number, stu_password, stu_name = Lg.get_information()
    PlannedCourseSpider(stu_number, stu_password, stu_name).run()

Lesson.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import re


def show_error(selector):
    """Print every ``alert('...')`` message found in the page's head scripts.

    Args:
        selector: Parsed page object exposing ``xpath``.
    """
    # Raw string: the original non-raw literal contained the invalid escape
    # sequences ``\(`` and ``\)``, which newer Python versions warn about.
    alert_pattern = re.compile(r"alert\('(.+?)'\);")
    for script_text in selector.xpath('/html/head/script/text()'):
        # An empty script yields no matches, so no separate emptiness check
        # is needed.
        for message in alert_pattern.findall(script_text):
            print('\n' + message)


class Lesson:
    """Plain record describing one course (or one offering of a course).

    Attributes:
        number: Value shown as the course code in ``show()``.
        name: Course name.
        code: Code submitted when the course is opened or selected.
        teacher_name: Teacher name.
        time: Class time.
        surplus: Remaining places.
    """

    def __init__(self, num, name, code, teacher_name, time, surplus):
        """Store the given values on the instance unchanged."""
        self.number = num
        self.name = name
        self.code = code
        self.teacher_name = teacher_name
        self.time = time
        self.surplus = surplus

    def show(self):
        """Print a one-line, tab-separated summary of the lesson."""
        # str.format converts every field itself, so a non-string value
        # (for example an int surplus) no longer raises TypeError the way
        # '+' concatenation did.
        print('课程代码:{}\t课程名:{}\t教师:{}\t时间:{}\t余量:{}'.format(
            self.number, self.name, self.teacher_name, self.time, self.surplus))

完整项目包括登录、获取信息和自动选课