# Course/Spider/test1.py

import csv
import requests
from lxml import etree


class YXZS:
    """Scrape the school list from yz.chsi.com.cn into a CSV file.

    Each listing page is downloaded with ``requests`` and parsed with
    ``lxml``; every ``div.sch-item`` element becomes one CSV row holding the
    school name, its tag and two links taken from the item's ``sch-link``
    block.

    The instance owns an open file handle. ``run`` closes it when the crawl
    finishes; the instance can also be used as a context manager.
    """

    BASE_URL = "https://yz.chsi.com.cn"
    # Step of the ``start`` query parameter between consecutive pages.
    PAGE_SIZE = 20
    CSV_HEADER = ("学校名称", "学校性质", "招生简章的链接", "调剂办法的链接")
    # Placeholder stored when a field is absent from the page. The spelling
    # is kept unchanged so new rows stay consistent with rows already on disk.
    MISSING = "unkonwn"
    REQUEST_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36 Edg/140.0.0.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    }

    def __init__(
        self,
        path: str = "./data.csv",
        total_pages: int = 47,
        timeout: float = 10,
    ) -> None:
        """Open the output file and write the header row if the file is empty.

        Args:
            path: CSV file to append to.
            total_pages: Number of listing pages ``run`` will request.
            timeout: Seconds to wait for each HTTP request.
        """
        self.url = "https://yz.chsi.com.cn/sch/?start={}"
        self.total_pages = total_pages
        self.timeout = timeout
        self.file = open(path, "+a", encoding="utf-8", newline="")
        # The file is opened in append mode, so rows from earlier runs are
        # kept. Only an empty file gets the header; otherwise every rerun
        # would insert another header line in the middle of the data.
        self.file.seek(0, 2)  # whence=2: jump to the end of the file
        if self.file.tell() == 0:
            # Written through csv so the line terminator matches the rows.
            csv.writer(self.file).writerow(self.CSV_HEADER)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def close(self) -> None:
        """Close the output file. Calling it more than once is harmless."""
        if not self.file.closed:
            self.file.close()

    def get_html(self, page=1):
        """Return the HTML of listing page ``page`` (1-based).

        Returns an empty string when the request fails or the response
        status is not 200, so the caller can skip the page and carry on.
        """
        start = (page - 1) * self.PAGE_SIZE
        try:
            resp = requests.get(
                self.url.format(start),
                headers=self.REQUEST_HEADERS,
                # Without a timeout a stalled connection blocks the crawl forever.
                timeout=self.timeout,
            )
        except requests.RequestException as exc:
            print("第{}页请求失败: {}".format(page, exc))
            return ""
        if resp.status_code == 200:
            return resp.text
        return ""

    @classmethod
    def _first(cls, node, path, prefix="", strip=False):
        """Return ``prefix`` + the first XPath match, or ``MISSING`` if none."""
        found = node.xpath(path)
        if not found:
            return cls.MISSING
        value = found[0].strip() if strip else found[0]
        return prefix + value

    def parse_html(self, html_text: str):
        """Extract every school entry from ``html_text`` and save it as a row."""
        if not html_text or not html_text.strip():
            return
        html = etree.HTML(html_text)
        if html is None:
            # NOTE(review): guards against the parser yielding no tree for
            # content without any markup - confirm against lxml behaviour.
            return
        for sch in html.xpath('//div[@class="sch-item"]'):
            item = {
                "name": self._first(
                    sch,
                    './/a[@class="name js-yxk-yxmc text-decoration-none"]/text()',
                    strip=True,
                ),
                "tag": self._first(
                    sch, './/span[@class="sch-tag"]/text()', strip=True
                ),
                # Second and fourth anchors of the link block; they are
                # stored under the two link columns of the header.
                "href1": self._first(
                    sch,
                    './/div[@class="sch-link"]/a[2]/@href',
                    prefix=self.BASE_URL,
                ),
                "href2": self._first(
                    sch,
                    './/div[@class="sch-link"]/a[4]/@href',
                    prefix=self.BASE_URL,
                ),
            }
            self.save_data(item)

    def save_data(self, item):
        """Append ``item`` as one CSV row, columns in the dict's key order."""
        # The writer is built per call so any dict can be saved, not only
        # the one produced by ``parse_html``.
        writer = csv.DictWriter(self.file, fieldnames=list(item))
        writer.writerow(item)

    def run(self):
        """Fetch pages 1..``total_pages`` in order and store their entries.

        Pages that come back empty are reported and skipped. The output
        file is closed when the crawl ends, whether or not it succeeded.
        """
        try:
            # Pages are 1-based: ``get_html`` maps page 1 to ``start=0``.
            for page in range(1, self.total_pages + 1):
                print("正在获取第{}页".format(page))
                html_text = self.get_html(page=page)
                if not html_text:
                    print("第{}页获取的内容为空".format(page))
                    continue
                self.parse_html(html_text)
        finally:
            self.close()

# Script entry point: build the scraper and crawl every listing page.
if __name__ == "__main__":
    spider = YXZS()
    spider.run()