US News大学排名数据获取

US News的大学排名数据获取有一点点复杂。US News 2022 Best Global Universities Rankings网页是动态刷新的，数据量也比较大。数据获取总共分为三步：

  flowchart LR
    id1(获取大学基本信息)-->id2(获取每所大学的详细信息)-->id3(各学科各大学排名指标)

获取基本信息

在不断下拉页面的过程中打开F12调试工具，可以看到有一个以“search?format=json&page=”开头的包，这个包的响应里面就有一些大学的基本信息。每个响应包里有10所大学的信息。

再看这个包的标头，就可以找到它的“请求URL”，我们也只要发这个请求URL，就可以获取相应的响应包。总共有2005所大学，所以只要循环201次即可。

#encoding=utf-8
import requests
import time

# Browser-like User-Agent header sent with every request.
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40'
}

# Paged JSON search endpoint; the page number is appended to this prefix.
SEARCH_URL = 'https://www.usnews.com/education/best-global-universities/search?format=json&page='
OUTPUT_PATH = 'collegeInfo.txt'
# 2005 universities at 10 per response -> pages 1..201.
LAST_PAGE = 201
# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30
# Seconds to pause between pages.
PAGE_DELAY = 3


def format_record(item):
    """Return one semicolon-separated record (no line terminator) for *item*.

    *item* is one element of the ``items`` list in the search response. The
    fields read are ``city``, ``country_name``, ``id``, ``name``, the value of
    the first ``ranks`` entry, the values of the first two ``stats`` entries
    and ``url``, in that order.

    Raises:
        KeyError / IndexError: if *item* lacks one of those fields.
    """
    fields = (
        item['city'],
        item['country_name'],
        str(item['id']),
        item['name'],
        str(item['ranks'][0]['value']),
        str(item['stats'][0]['value']),
        str(item['stats'][1]['value']),
        item['url'],
    )
    return ';'.join(fields)


def write_records(fp, items):
    """Echo and write one line per item to *fp*; return the number written.

    Each record is terminated with a newline. Without it every university
    ends up on one single line and the file cannot be split into rows later.
    """
    written = 0
    for item in items:
        record = format_record(item)
        print(record)
        fp.write(record + '\n')
        written += 1
    return written


def fetch_basic_info():
    """Download every search page and store the records in ``OUTPUT_PATH``.

    Stops at the first non-200 response. The output file is closed even when
    a request or a malformed item raises.
    """
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as fp:
        for page in range(1, LAST_PAGE + 1):
            response = requests.get(url=SEARCH_URL + str(page), headers=headers,
                                    timeout=REQUEST_TIMEOUT)
            if response.status_code != 200:
                print('response error')
                break
            write_records(fp, response.json().get("items"))
            time.sleep(PAGE_DELAY)


if __name__ == '__main__':
    fetch_basic_info()

获取详细信息

上面的代码获取了所有大学的基本信息，我保存到了一个txt文件“collegeInfo.txt”中，然后把它导入到excel里，分隔符选择分号，就可以得到每所大学的学校名字和网页链接。把校名和网页链接分别保存到两个文本文件“collegeNames.txt”和“collegeLinks.txt”中（每行一所大学，两个文件的行顺序保持一致），用于后续获取大学的详细信息。
详细信息包括哪些内容呢？以哈佛大学为例：

包括上面的“University Data”和“Rankings”，基本上每个大学都有这两部分数据，只是有些数据项可能有欠缺。所以针对每个大学我都从一个空的字典开始往里填充数据，一所大学对应一个Dataframe，最后把所有大学的Dataframe Concat起来就好了。

# -*- coding: utf-8 -*-
import lxml
from lxml import etree
import requests
import time
import pandas as pd
import random

# Browser-like User-Agent header sent with every request.
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40'
}

NAMES_PATH = 'collegeNames.txt'
LINKS_PATH = 'collegeLinks.txt'
OUTPUT_PATH = 'UniversityDetail.xlsx'
# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30

# Columns that follow 'University' in every record, in output order. They
# start out empty so universities lacking an item still have the column.
DETAIL_COLUMNS = (
    # "University Data" section
    'Total number of students',
    'Number of international students',
    'Total number of academic staff',
    'Number of international staff',
    'Number of undergraduate degrees awarded',
    "Number of master's degrees awarded",
    'Number of doctoral degrees awarded',
    'Number of research only staff',
    'Number of new undergraduate students',
    "Number of new master's students",
    'Number of new doctoral students',
    # overall rankings
    'Best Global Universities',
    'Best Global Universities in Region',
    'Best Global Universities in Country',
    # subject rankings
    'Agricultural Sciences',
    'Arts and Humanities',
    'Biology and Biochemistry',
    'Biotechnology and Applied Microbiology',
    'Cardiac and Cardiovascular Systems',
    'Cell Biology',
    'Chemical Engineering',
    'Chemistry',
    'Civil Engineering',
    'Clinical Medicine',
    'Computer Science',
    'Condensed Matter Physics',
    'Economics and Business',
    'Electrical and Electronic Engineering',
    'Endocrinology and Metabolism',
    'Energy and Fuels',
    'Engineering',
    'Environment/Ecology',
    'Food Science and Technology',
    'Gastroenterology and Hepatology',
    'Geosciences',
    'Immunology',
    'Infectious Diseases',
    'Materials Science',
    'Mathematics',
    'Mechanical Engineering',
    'Microbiology',
    'Molecular Biology and Genetics',
    'Nanoscience and Nanotechnology',
    'Neuroscience and Behavior',
    'Oncology',
    'Optics',
    'Pharmacology and Toxicology',
    'Physical Chemistry',
    'Physics',
    'Plant and Animal Science',
    'Polymer Science',
    'Psychiatry/Psychology',
    'Public, Environmental and Occupational Health',
    'Radiology, Nuclear Medicine and Medical Imaging',
    'Social Sciences and Public Health',
    'Space Science',
    'Surgery',
)

# Ranking labels that are stored in the shared "in Region" column.
REGION_LABELS = frozenset((
    'Best Global Universities in Asia',
    'Best Global Universities in Africa',
    'Best Global Universities in Australia/New Zealand',
    'Best Global Universities in Europe',
    'Best Global Universities in Latin America',
))


def new_record(school_name):
    """Return a fresh record for *school_name* with every known column empty."""
    record = {'University': school_name}
    record.update(dict.fromkeys(DETAIL_COLUMNS, ''))
    return record


def strip_thousands(value):
    """Return *value* with all commas removed (e.g. '21,887' -> '21887')."""
    return value.replace(',', '')


def pick_rank(texts):
    """Return the rank from the text nodes of a ranking badge.

    When the first node is a bare '#', the rank is the following node;
    otherwise the first node is returned unchanged. Returns '' when no usable
    text node is present.
    """
    if not texts:
        return ''
    if texts[0] == '#':
        return texts[1] if len(texts) > 1 else ''
    return texts[0]


def store_rank(record, label, rank):
    """Store *rank* in *record* under the column that *label* belongs to.

    Labels that already are a column are stored as-is; the regional labels
    share 'Best Global Universities in Region'; everything else goes to
    'Best Global Universities in Country'.
    """
    if label in record:
        record[label] = rank
    elif label in REGION_LABELS:
        record['Best Global Universities in Region'] = rank
    else:
        # NOTE(review): any label not listed above is treated as the country
        # ranking, so an unexpected new subject would overwrite this column.
        record['Best Global Universities in Country'] = rank


def _first_text(node, path):
    """Return ``.text`` of the first element matching *path* under *node*, or None."""
    matches = node.xpath(path)
    return matches[0].text if matches else None


def _collect_university_data(tree, record, prefix):
    """Copy the label/value pairs found under ``#uniData`` into *record*."""
    for box in tree.xpath('//*[@id="uniData"]/div/div'):
        label = _first_text(box, './p[1]')
        if label is None:
            continue
        value = strip_thousands(_first_text(box, './p[2]') or '')
        print(prefix + ' ' + label + ': ' + value)
        record[label] = value


def _collect_rankings(tree, record, prefix):
    """Copy the overall and subject rankings found under ``#rankings`` into *record*."""
    for section in tree.xpath('//*[@id="rankings"]/div'):
        css_class = section.get('class')
        if css_class == 'mb5':
            ranking_lists = section.xpath('./div/ul')
        elif css_class == 'subject-rankings':
            ranking_lists = section.xpath('./ul')
        else:
            # First unrecognised section ends the scan.
            break

        for ranking_list in ranking_lists:
            for entry in ranking_list.xpath('./li'):
                label = _first_text(entry, './a/strong')
                rank = pick_rank(entry.xpath('./a/div/strong/text()'))
                if label is None or not rank:
                    continue
                store_rank(record, label, rank)
                print(prefix + ' ' + label + ': ' + rank)


def fetch_university_details():
    """Scrape each university page and write all records to ``OUTPUT_PATH``.

    Names and links are read in lockstep, one per line, from ``NAMES_PATH``
    and ``LINKS_PATH``. Reading stops at the first empty name or the first
    non-200 response; whatever was collected up to that point is still saved.
    """
    frames = []
    with open(NAMES_PATH, 'r', encoding='utf-8') as fp_name, \
            open(LINKS_PATH, 'r', encoding='utf-8') as fp_links:
        for cnt, (name_line, link_line) in enumerate(zip(fp_name, fp_links), start=1):
            schoolName = name_line.rstrip('\n')
            if not schoolName:
                break
            # The line terminator is not part of the URL.
            schoolUrl = link_line.strip()

            response = requests.get(url=schoolUrl, headers=headers,
                                    timeout=REQUEST_TIMEOUT)
            if response.status_code != 200:
                print('response error ' + str(response.status_code) + ': ' + schoolName)
                break

            tree = etree.HTML(response.text.encode('utf-8'))
            record = new_record(schoolName)
            prefix = str(cnt) + ' ' + schoolName
            _collect_university_data(tree, record, prefix)
            _collect_rankings(tree, record, prefix)
            frames.append(pd.DataFrame(record, index=[0]))

            time.sleep(random.random()*2+1)

    if not frames:
        # pd.concat raises ValueError on an empty list.
        print('no university data collected, nothing written')
        return
    pd.concat(frames).to_excel(OUTPUT_PATH, index=False)


if __name__ == '__main__':
    fetch_university_details()

各学科各大学排名指标

不同的学科也有各自的排名，大致数据如下：

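遍历每个大学的网页的时候，可以在每个网页中找到多个学科的排名指标，为每个学科设置一个Dataframe的列表，然后在最后Concat起来，再写入excel中就好了。除了“collegeNames.txt”和“collegeLinks.txt”之外，这一步还需要把“collegeInfo.txt”中的国家一列按同样的行顺序保存为“collegeCountries.txt”。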

from lxml import etree
import requests
import time
import re
import pandas
import random

from requests.api import head

# Known ranking categories. The numbers record the original output order;
# iteration order of this dict decides the order in which files are written.
fpDict = {'Global Universities': 0,
          'Agricultural Sciences': 1,
          'Arts and Humanities':2,
          'Biology and Biochemistry':3,
          'Biotechnology and Applied Microbiology':4,
          'Cardiac and Cardiovascular Systems':5,
          'Cell Biology':6,
          'Chemical Engineering':7,
          'Chemistry':8,
          'Civil Engineering':9,
          'Clinical Medicine':10,
          'Computer Science':11,
          'Condensed Matter Physics':12,
          'Economics and Business':13,
          'Electrical and Electronic Engineering':14,
          'Endocrinology and Metabolism':15,
          'Energy and Fuels':16,
          'Engineering':17,
          'Environment/Ecology':18,
          'Food Science and Technology':19,
          'Gastroenterology and Hepatology':20,
          'Geosciences':21,
          'Immunology':22,
          'Infectious Diseases':23, 
          'Materials Science':24,
          'Mathematics':25,
          'Mechanical Engineering':26,
          'Microbiology':27,
          'Molecular Biology and Genetics':28,
          'Nanoscience and Nanotechnology':29,
          'Neuroscience and Behavior':30,
          'Oncology':31,
          'Optics':32,
          'Pharmacology and Toxicology':33,
          'Physical Chemistry':34,
          'Physics':35,
          'Plant and Animal Science':36,
          'Polymer Science':37,
          'Psychiatry/Psychology':38,
          'Public, Environmental and Occupational Health':39,
          'Radiology, Nuclear Medicine and Medical Imaging':40,
          'Social Sciences and Public Health':41,
          'Space Science':42,
          'Surgery':43}

# Browser-like User-Agent header sent with every request.
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40'
}

NAMES_PATH = 'collegeNames.txt'
COUNTRIES_PATH = 'collegeCountries.txt'
LINKS_PATH = 'collegeLinks.txt'
# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30

# Indicator columns that follow University / Country / Rank, in output order.
INDICATOR_COLUMNS = (
    'overall score',
    'global research reputation',
    'regional research reputation',
    'publications',
    'books',
    'conferences',
    'normalized citation impact',
    'total citations',
    'number of publications that are among the 10% most cited',
    'percentage of total publications that are among the 10% most cited',
    'international collaboration - relative to country',
    'international collaboration',
    'number of highly cited papers that are among the top 1% most cited',
    'percentage of highly cited papers that are among the top 1% most cited',
)

# Characters that cannot appear in a file name on common file systems.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def output_filename(category):
    """Return the workbook name for *category*.

    The name is the category itself plus '.xlsx', with characters that are
    not valid in file names replaced by '_' ('Environment/Ecology' ->
    'Environment_Ecology.xlsx'). This also repairs the eight hand-typed names
    that had lost their last letter (e.g. 'Chemistr.xlsx').
    """
    return _UNSAFE_FILENAME_CHARS.sub('_', category) + '.xlsx'


def normalize_indicator_label(category, label):
    """Return *label* lower-cased with the '<category> ' text removed.

    Plain string replacement is used so that a category containing regex
    metacharacters cannot break or distort the match.
    """
    return label.replace(category + ' ', '').lower()


def extract_rank(texts):
    """Return the rank from the text nodes of a ranking badge.

    When the first node is a bare '#', the rank is the following node;
    otherwise the first node is returned unchanged. Returns '' when no usable
    text node is present.
    """
    if not texts:
        return ''
    if texts[0] == '#':
        return texts[1] if len(texts) > 1 else ''
    return texts[0]


def new_indicator_record(school_name, country, rank):
    """Return a record with identity columns filled and every indicator empty."""
    record = {'University': school_name, 'Country': country, 'Rank': rank}
    record.update(dict.fromkeys(INDICATOR_COLUMNS, ''))
    return record


def store_indicator(record, label, value):
    """Store *value* under *label*; 'global score' is an alias of 'overall score'."""
    if label in ('global score', 'overall score'):
        record['overall score'] = value
    else:
        record[label] = value


def _first_match_text(node, path):
    """Return ``.text`` of the first element matching *path* under *node*, or None."""
    matches = node.xpath(path)
    return matches[0].text if matches else None


def _collect_category(box, cnt, school_name, country):
    """Parse one category box; return ``(category, record)`` or None if it has no title."""
    category = _first_match_text(box, './button/h3')
    if category is None:
        return None
    rank = extract_rank(box.xpath('./div/ul/li/a/div/strong/text()'))
    record = new_indicator_record(school_name, country, rank)

    for indicator in box.xpath('./div/div'):
        raw_label = _first_match_text(indicator, './p[1]')
        if raw_label is None:
            continue
        label = normalize_indicator_label(category, raw_label)
        value = (_first_match_text(indicator, './p[2]') or '').replace('#', '')
        store_indicator(record, label, value)
        print(str(cnt) + ' | ' + school_name + ' | ' + category + ' | ' + label + ': ' + value)
    return category, record


def fetch_subject_indicators():
    """Scrape the indicator tables of every university, one workbook per category.

    Names, countries and links are read in lockstep, one per line. A
    university whose page does not answer with status 200 is skipped. Frames
    are grouped by category name, so a category missing from ``fpDict`` gets
    its own workbook instead of aborting the run, and a category without any
    data is skipped instead of preventing all other workbooks from being
    written.
    """
    frames_by_category = {category: [] for category in fpDict}

    with open(NAMES_PATH, 'r', encoding='utf-8') as fp_name, \
            open(COUNTRIES_PATH, 'r', encoding='utf-8') as fp_country, \
            open(LINKS_PATH, 'r', encoding='utf-8') as fp_links:
        rows = zip(fp_name, fp_country, fp_links)
        for cnt, (name_line, country_line, link_line) in enumerate(rows, start=1):
            schoolName = name_line.rstrip('\n')
            if not schoolName:
                break
            schoolCountry = country_line.rstrip('\n')
            # The line terminator is not part of the URL.
            schoolUrl = link_line.strip()

            response = requests.get(url=schoolUrl, headers=headers,
                                    timeout=REQUEST_TIMEOUT)
            if response.status_code == 200:
                tree = etree.HTML(response.text.encode('utf-8'))
                for box in tree.xpath('//*[@id="indicators"]/div[2]/div'):
                    parsed = _collect_category(box, cnt, schoolName, schoolCountry)
                    if parsed is None:
                        continue
                    category, record = parsed
                    frame = pandas.DataFrame(record, index=[0])
                    frames_by_category.setdefault(category, []).append(frame)
            else:
                print(str(cnt) + ' | ' + schoolName + ' | response error ' + str(response.status_code))
            time.sleep(random.random()*2+1)

    for category, frames in frames_by_category.items():
        if not frames:
            # pandas.concat raises ValueError on an empty list.
            print('no data for ' + category + ', skipped')
            continue
        pandas.concat(frames).to_excel(output_filename(category), index=False)


if __name__ == '__main__':
    fetch_subject_indicators()