THE大学排名

  THE大学排名的数据比较容易获取:在“THE大学排名2022”页面上,所有数据都可以在这一个网页中找到。

  • “any subject”下拉菜单中可以选择不同学科,如果不选的话那就是综合排名;
  • 不需要翻页,一页就是一个学科;
  • 每种学科(包括综合排名)的排名都可以各自写入一个Excel文件中;
  • 每种学科排名有两个标签栏中的数据需要获取,一个是“Rankings”,一个是“Scores”。

  遇到的问题是,“any subject”对应的“select”元素是不可见的,所以不能直接用Selenium的Select方法,解决方法就是用js脚本让它显示出来:

1
2
3
4
# Locate the hidden subject <select> and force it visible so that Selenium's
# Select helper is able to interact with its options.
subject_select = driver.find_element(By.XPATH, '//*[@id="subjects"]')
# Hand the element itself to the script instead of assuming it is the fourth
# <select> in the document; a positional index breaks as soon as the page
# gains or loses another <select>.
driver.execute_script('arguments[0].style.display = "block";', subject_select)
sel = Select(subject_select)

  用这个方法会让网页变得有点鬼畜,但确实是有效的

  下面是完整的脚本:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#encoding=utf-8
# THE ranks 2022 for all subjects
from re import I
from selenium.webdriver import Edge
from selenium.webdriver.common.by import By
import time
import xlsxwriter
from selenium.webdriver.support.ui import Select

# Scrape the THE World University Rankings 2022 table for the first
# SUBJECT_COUNT entries of the subject drop-down and save each one to its own
# "<subject name>.xlsx" workbook: columns 0-6 come from the stats tab,
# columns 7-12 from the scores tab.
SUBJECT_COUNT = 32
ROW_XPATH = '//*[@id="datatable-1"]/tbody/tr'
HEADERS = (
    'Rank',
    'University',
    'Location',
    'No. of FTE Students',
    'No. of Students per Staff',
    'International Students',
    'Female:Male Ratio',
    'Overall',
    'Teaching',
    'Research',
    'Citations',
    'Industry Income',
    'International Outlook',
)

driver = Edge()
try:
    curl = 'https://www.timeshighereducation.com/world-university-rankings/2022/world-ranking#!/page/0/length/-1/sort_by/rank/sort_order/asc/cols/stats'
    driver.get(curl)
    time.sleep(1)
    # Elements that switch the table between the stats and the scores tab.
    ch2status = driver.find_element(By.XPATH, '//*[@id="stats"]')
    ch2score = driver.find_element(By.XPATH, '//*[@id="scores"]')
    # The subject <select> is hidden; make that very element visible rather
    # than addressing it by its position among all <select> elements.
    subject_select = driver.find_element(By.XPATH, '//*[@id="subjects"]')
    driver.execute_script('arguments[0].style.display = "block";', subject_select)
    sel = Select(subject_select)

    for q in range(1, SUBJECT_COUNT + 1):
        # select_by_index is 0-based, XPath positions are 1-based.
        sel.select_by_index(q - 1)
        # NOTE(review): no wait follows the selection; the script relies on
        # the table already showing the new subject when rows are read.
        option = driver.find_element(By.XPATH, f'//*[@id="subjects"]/option[{q}]')
        SubjectName = option.text
        print(SubjectName + ' start')

        # The context manager writes and closes the workbook even if a lookup
        # below raises, so partial results are not lost.
        with xlsxwriter.Workbook(SubjectName + '.xlsx') as workbook:
            sheet = workbook.add_worksheet()
            sheet.write_row(0, 0, HEADERS)

            # Count the rows up front instead of probing one index after
            # another until a bare except swallows the lookup failure.
            total_rows = len(driver.find_elements(By.XPATH, ROW_XPATH))

            for row in range(1, total_rows + 1):
                row_xpath = f'{ROW_XPATH}[{row}]'
                rank = driver.find_element(By.XPATH, row_xpath + '/td[1]')
                sheet.write(row, 0, rank.text)
                university = driver.find_element(By.XPATH, row_xpath + '/td[2]/a')
                sheet.write(row, 1, university.text)
                location = driver.find_element(By.XPATH, row_xpath + '/td[2]/div/div/span/a')
                sheet.write(row, 2, location.text)
                for k in range(3, 7):
                    cell = driver.find_element(By.XPATH, f'{row_xpath}/td[{k}]')
                    sheet.write(row, k, cell.text)
                print(SubjectName + ': ' + str(row) + ' Status finished!')

            # Report the real row count (the original printed count + 1).
            print('total Item of ' + SubjectName + ' is ' + str(total_rows))

            driver.execute_script('arguments[0].click();', ch2score)

            for row in range(1, total_rows + 1):
                for k in range(3, 9):
                    cell = driver.find_element(By.XPATH, f'{ROW_XPATH}[{row}]/td[{k}]')
                    # Score columns continue right after the seven stats columns.
                    sheet.write(row, k + 4, cell.text)
                print(SubjectName + ': ' + str(row) + '/' + str(total_rows) + ' Score finished!')

            # Return to the stats tab so the next subject starts from it.
            driver.execute_script('arguments[0].click();', ch2status)
finally:
    # quit() ends the whole WebDriver session (browser and driver process),
    # and runs even when scraping fails part-way through.
    driver.quit()

软科大学排名

世界大学学术排名

  软科的排名数据也比较容易获取,见“软科世界大学学术排名2021”页面。

  • 总共1000条记录,需要翻页
  • 每个大学的具体指标需要下拉栏选择

  翻页按键的XPATH会根据页数不同发生变化,所以采用了动态搜寻的方法;每次先把当前页面的数据拿到,然后逐个更换指标依次获取所有指标。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#encoding=utf-8
# 软科世界大学学术排名 2021
from selenium.webdriver import Edge
from selenium.webdriver.common.by import By
import time
import xlsxwriter

# Scrape the ShanghaiRanking ARWU 2021 table page by page into a single
# workbook. For every page the fixed columns are read first, then each entry
# of the indicator drop-down in the sixth column is selected in turn and that
# column is re-read.
PAGE_COUNT = 34
ROWS_PER_PAGE = 30
INDICATOR_COUNT = 6
TABLE_XPATH = '//*[@id="content-box"]/div[2]/table'
ROW_XPATH = TABLE_XPATH + '/tbody/tr'
DROPDOWN_XPATH = TABLE_XPATH + '/thead/tr/th[6]/div/div[1]'
# The pager item's position shifts with the current page, so match it by title.
NEXT_PAGE_XPATH = '//*[@id="content-box"]/ul/li[@title="下一页"]'
# Paths, relative to a table row, of the cells for sheet columns 0-4.
BASE_CELLS = ('td[1]/div', 'td[2]/div/div[2]/div', 'td[3]', 'td[4]', 'td[5]')
HEADERS = (
    '排名',
    '学校名称',
    '国家/地区',
    '国家/地区排名',
    '总分',
    '校友获奖',
    '教师获奖',
    '高被引科学家',
    'N&S论文',
    '国际论文',
    '师均表现',
)

driver = Edge()
try:
    # The context manager writes and closes the workbook even if a lookup
    # below raises, so the rows scraped so far are kept.
    with xlsxwriter.Workbook("软科世界大学学术排名_2021.xlsx") as workbook:
        sheet = workbook.add_worksheet()
        sheet.write_row(0, 0, HEADERS)

        curl = 'https://www.shanghairanking.cn/rankings/arwu/2021'
        driver.get(curl)
        time.sleep(1)

        lastRow = 1  # next free sheet row (row 0 holds the header)
        for page in range(PAGE_COUNT):
            print('page ' + str(page+1))

            # Count the rows once instead of probing indices behind a bare
            # except; never read more than ROWS_PER_PAGE of them.
            found = len(driver.find_elements(By.XPATH, ROW_XPATH))
            row_count = min(found, ROWS_PER_PAGE)

            for offset in range(row_count):
                row_xpath = f'{ROW_XPATH}[{offset + 1}]'
                for col, cell_path in enumerate(BASE_CELLS):
                    cell = driver.find_element(By.XPATH, f'{row_xpath}/{cell_path}')
                    sheet.write(lastRow + offset, col, cell.text)

            for scoreIndx in range(1, INDICATOR_COUNT + 1):
                # Open the indicator drop-down, then pick its scoreIndx-th entry.
                opener = driver.find_element(By.XPATH, DROPDOWN_XPATH + '/div[1]')
                driver.execute_script('arguments[0].click();', opener)
                time.sleep(1)

                choice = driver.find_element(By.XPATH, f'{DROPDOWN_XPATH}/div[2]/ul/li[{scoreIndx}]')
                driver.execute_script('arguments[0].click();', choice)
                time.sleep(1)

                for offset in range(row_count):
                    cell = driver.find_element(By.XPATH, f'{ROW_XPATH}[{offset + 1}]/td[6]')
                    # Indicator columns follow the five fixed columns.
                    sheet.write(lastRow + offset, scoreIndx + 4, cell.text)

            lastRow += row_count

            if page < PAGE_COUNT - 1:
                nextPage = driver.find_element(By.XPATH, NEXT_PAGE_XPATH)
                driver.execute_script('arguments[0].click();', nextPage)
                time.sleep(1)
finally:
    # quit() ends the whole WebDriver session, even when scraping fails.
    driver.quit()

世界一流学科排名

  世界一流学科排名对每个学科都有专门的序号,这个序号和对应的网页有关,所以只需要建立一个序号的字典,就可以依次遍历所有的学科网页。在每个网页中按照与世界大学学术排名类似的方式处理即可。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#encoding=utf-8
#软科世界一流学科排名 2021
from selenium.webdriver import Edge
from selenium.webdriver.common.by import By
import time
import xlsxwriter

# Scrape every ShanghaiRanking GRAS 2021 subject page listed in SUBJECT_CODES
# and save each subject to its own "<subject name>.xlsx" workbook. For every
# result page the fixed columns are read first, then each entry of the
# indicator drop-down in the fifth column is selected and that column re-read.
#
# The original opened 'linksTail.txt' here without ever reading or closing
# it; that leaked a file handle and made the script fail when the file was
# missing, so the call has been dropped.
SUBJECT_CODES = (
    'RS0101', 'RS0102', 'RS0103', 'RS0104', 'RS0105', 'RS0106', 'RS0107',
    'RS0108', 'RS0201', 'RS0202', 'RS0205', 'RS0206', 'RS0207', 'RS0208',
    'RS0210', 'RS0211', 'RS0212', 'RS0213', 'RS0214', 'RS0215', 'RS0216',
    'RS0217', 'RS0219', 'RS0220', 'RS0221', 'RS0222', 'RS0223', 'RS0224',
    'RS0226', 'RS0227', 'RS0301', 'RS0302', 'RS0303', 'RS0304', 'RS0401',
    'RS0402', 'RS0403', 'RS0404', 'RS0405', 'RS0406', 'RS0501', 'RS0502',
    'RS0503', 'RS0504', 'RS0505', 'RS0506', 'RS0507', 'RS0508', 'RS0509',
    'RS0510', 'RS0511', 'RS0512', 'RS0513', 'RS0515',
)
ROWS_PER_PAGE = 30
INDICATOR_COUNT = 5
TABLE_XPATH = '//*[@id="content-box"]/div[2]/table'
ROW_XPATH = TABLE_XPATH + '/tbody/tr'
DROPDOWN_XPATH = TABLE_XPATH + '/thead/tr/th[5]/div/div[1]'
# The pager item's position shifts with the current page, so match it by title.
NEXT_PAGE_XPATH = '//*[@id="content-box"]/ul/li[@title="下一页"]'
# Paths, relative to a table row, of the cells for sheet columns 0-3.
BASE_CELLS = ('td[1]/div', 'td[2]/div/div[2]/div', 'td[3]', 'td[4]')
HEADERS = (
    '排名',
    '学校名称',
    '国家/地区',
    '总分',
    '重要期刊论文数',
    '论文标准化影响力',
    '国际合作论文比例',
    '顶尖期刊论文数',
    '教师获权威奖项数',
)

driver = Edge()
try:
    for code in SUBJECT_CODES:
        url = 'https://www.shanghairanking.cn/rankings/gras/2021/' + code
        driver.get(url)
        time.sleep(1)

        subjectObj = driver.find_element(By.XPATH, '//*[@id="content-box"]/div[1]/div[1]/div[3]')
        subjectName = subjectObj.text
        print('Start of '+subjectName)

        # The context manager writes and closes the workbook even if a lookup
        # below raises, so the rows scraped so far are kept.
        with xlsxwriter.Workbook(subjectName + '.xlsx') as workbook:
            sheet = workbook.add_worksheet()
            sheet.write_row(0, 0, HEADERS)

            lastRow = 1  # next free sheet row (row 0 holds the header)
            page = 1
            while True:
                # Count the rows once instead of probing indices behind a
                # bare except; never read more than ROWS_PER_PAGE of them.
                found = len(driver.find_elements(By.XPATH, ROW_XPATH))
                row_count = min(found, ROWS_PER_PAGE)

                for offset in range(row_count):
                    row_xpath = f'{ROW_XPATH}[{offset + 1}]'
                    for col, cell_path in enumerate(BASE_CELLS):
                        cell = driver.find_element(By.XPATH, f'{row_xpath}/{cell_path}')
                        sheet.write(lastRow + offset, col, cell.text)

                for scoreIndx in range(1, INDICATOR_COUNT + 1):
                    # Open the indicator drop-down, then pick its scoreIndx-th entry.
                    opener = driver.find_element(By.XPATH, DROPDOWN_XPATH + '/div[1]')
                    driver.execute_script('arguments[0].click();', opener)
                    time.sleep(1)

                    choice = driver.find_element(By.XPATH, f'{DROPDOWN_XPATH}/div[2]/ul/li[{scoreIndx}]')
                    driver.execute_script('arguments[0].click();', choice)
                    time.sleep(1)

                    for offset in range(row_count):
                        cell = driver.find_element(By.XPATH, f'{ROW_XPATH}[{offset + 1}]/td[5]')
                        # Indicator columns follow the four fixed columns.
                        sheet.write(lastRow + offset, scoreIndx + 3, cell.text)

                lastRow += row_count

                print('page ' + str(page) + ' finished!')
                page = page + 1

                # Stop when there is no pager at all (find_elements returns
                # an empty list) or the next-page button is disabled.
                next_buttons = driver.find_elements(By.XPATH, NEXT_PAGE_XPATH)
                if not next_buttons:
                    break
                nextPage = next_buttons[0]
                if nextPage.get_attribute('aria-disabled') == 'true':
                    break
                driver.execute_script('arguments[0].click();', nextPage)
                time.sleep(1)

        print('End of '+subjectName)
finally:
    # quit() ends the whole WebDriver session, even when scraping fails.
    driver.quit()