1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
import math
import time

import xlsxwriter
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Edge
from selenium.webdriver.common.by import By
# Entry page: the 2021 arts & humanities subject rankings on topuniversities.com.
curl = 'https://www.topuniversities.com/university-rankings/university-subject-rankings/2021/arts-humanities'

# Start an Edge browser session and open the entry page.
driver = Edge()
driver.get(curl)

# Fixed pause after navigation — presumably to let the page finish rendering
# before the first element lookup; NOTE(review): confirm 1 s is enough.
time.sleep(1)
# Rows shown per results page after the page-size option below is selected.
ITEMS_PER_PAGE = 100

# Column titles written to row 0 of every workbook, in column order.
HEADERS = (
    'Rank',
    'University',
    'Location',
    'Overall Score',
    'H-index Citations',
    'Citations per Paper',
    'Academic Reputation',
    'Employer Reputation',
)

SUBJECT_SELECTOR_XPATH = '//*[@id="ranking-fillters"]/div[7]/div/div'
SUBJECT_OPTION_XPATH = '//*[@id="ranking-fillters"]/div[7]/div/div/div[2]/div[{}]'
TOTAL_COUNT_XPATH = '//*[@id="_totalcountresults"]'
INDICATORS_TAB_XPATH = '//*[@id="block-tu-d8-content"]/div/article/div/div[3]/div/div[1]/div/div[1]/div/div/ul/li[2]/a'
PAGE_SIZE_DROPDOWN_XPATH = '//*[@id="block-tu-d8-content"]/div/article/div/div[3]/div/div[1]/div/div[3]/div[4]/div[1]/div[2]/i'
PAGE_SIZE_OPTION_XPATH = '//*[@id="block-tu-d8-content"]/div/article/div/div[3]/div/div[1]/div/div[3]/div[4]/div[1]/div[2]/div[2]/div[4]'
RESULT_ROW_XPATH = '//*[@id="ranking-data-load_ind"]/div[{}]'
# Pagination anchors from the third <li> onwards (the original search also
# started at li[3]).
PAGINATION_LINKS_XPATH = '//*[@id="alt-style-pagination"]/li[position() >= 3]/a'

# XPath suffixes, relative to one result row, for the cell feeding each column
# of HEADERS (same order): rank, university, location, then five indicators.
ROW_CELL_SUFFIXES = (
    '/div/div/div/div[1]/div/div/div/div/div[1]/div',
    '/div/div/div/div[1]/div/div/div/div/div[2]/div/div[1]/div',
    '/div/div/div/div[1]/div/div/div/div/div[2]/div/div[2]',
) + tuple(
    '/div/div/div/div[2]/div/div/div/div[' + str(n) + ']' for n in range(1, 6)
)


def _js_click(driver, element):
    """Click *element* via injected JavaScript, then pause for one second.

    NOTE(review): a script click is used instead of ``element.click()`` —
    presumably to avoid interception by overlapping page elements; confirm.
    """
    driver.execute_script('arguments[0].click();', element)
    time.sleep(1)


def _scrape_current_page(driver, sheet, next_row, item_total):
    """Copy every result row on the current page into *sheet*.

    Rows are visited by 1-based position until a position has no element.
    Rows carrying a truthy ``customblock`` attribute are skipped
    (presumably non-university blocks — TODO confirm).

    Args:
        driver: The active WebDriver.
        sheet: Worksheet receiving the cell text.
        next_row: Worksheet row index for the first university written.
        item_total: Total result count, used only in progress output.

    Returns:
        The worksheet row index following the last row written.

    Raises:
        NoSuchElementException: If a non-skipped row lacks an expected cell.
    """
    position = 1
    while True:
        row_xpath = RESULT_ROW_XPATH.format(position)
        try:
            row = driver.find_element(By.XPATH, row_xpath)
        except NoSuchElementException:
            # No element at this position: the page's rows are exhausted.
            break
        custom_block = row.get_attribute('customblock')
        time.sleep(0.5)
        if not custom_block:
            for column, suffix in enumerate(ROW_CELL_SUFFIXES):
                cell = driver.find_element(By.XPATH, row_xpath + suffix)
                sheet.write(next_row, column, cell.text)
            print(str(next_row) + '/' + str(item_total) + ' finished!')
            next_row += 1
        position += 1
    return next_row


def _find_next_page_link(driver):
    """Return the pagination anchor whose class is 'page-link next', or None.

    The scan is bounded by the anchors actually present, so a missing
    "next" link yields None instead of searching forever.
    """
    for link in driver.find_elements(By.XPATH, PAGINATION_LINKS_XPATH):
        if link.get_attribute('class') == 'page-link next':
            return link
    return None


try:
    for i in range(1, 59):
        # NOTE(review): dropdown entries 1 and 7 are skipped — presumably they
        # are not selectable subjects; confirm against the live page.
        if i in (1, 7):
            continue

        # Open the subject dropdown and pick the i-th entry.
        _js_click(driver, driver.find_element(By.XPATH, SUBJECT_SELECTOR_XPATH))
        subject_option = driver.find_element(By.XPATH, SUBJECT_OPTION_XPATH.format(i))
        subject_name = subject_option.text  # read before the click changes the page
        _js_click(driver, subject_option)
        print('Select Subject: ' + subject_name)

        # The context manager closes (and thereby saves) the workbook even if
        # scraping this subject fails part-way through.
        with xlsxwriter.Workbook(subject_name + '.xlsx') as workbook:
            sheet = workbook.add_worksheet()

            total_text = driver.find_element(By.XPATH, TOTAL_COUNT_XPATH).text
            item_total = int(total_text)
            print('Total Item count in ' + subject_name + ': ' + total_text)

            _js_click(driver, driver.find_element(By.XPATH, INDICATORS_TAB_XPATH))
            print('Change tab to Ranking Indicators')

            # Open the page-size dropdown and choose its fourth option.
            _js_click(driver, driver.find_element(By.XPATH, PAGE_SIZE_DROPDOWN_XPATH))
            _js_click(driver, driver.find_element(By.XPATH, PAGE_SIZE_OPTION_XPATH))
            print('Now there are 100 items in every page')

            for column, title in enumerate(HEADERS):
                sheet.write(0, column, title)

            page_count = math.ceil(item_total / ITEMS_PER_PAGE)
            next_row = 1  # row 0 holds the headers
            for page in range(page_count):
                next_row = _scrape_current_page(driver, sheet, next_row, item_total)
                if page == page_count - 1:
                    break
                next_link = _find_next_page_link(driver)
                if next_link is None:
                    # Keep what was collected rather than hanging or crashing.
                    print('No next-page link found; stopping after page ' + str(page + 1))
                    break
                _js_click(driver, next_link)
                print('chaneg to page ' + str(page + 2))

        print('finish ' + subject_name)
finally:
    # quit() ends the whole WebDriver session (close() only closes the
    # current window), and the finally clause runs on errors too.
    driver.quit()
|