"""Scrape the school listing pages of yz.chsi.com.cn into ``./data.csv``."""

import csv

import requests
from lxml import etree


class YXZS:
    """Crawler that walks the paginated school list and appends rows to a CSV.

    Each listing page is fetched with ``requests``, parsed with ``lxml`` and
    every ``div.sch-item`` element found is written as one CSV row with the
    keys ``name``, ``tag``, ``href1`` and ``href2``.

    The instance owns an open file handle (``self.file``); call ``close()``
    or use the instance as a context manager to release it.
    """

    # Prefix prepended to the relative links found on the listing page.
    BASE_URL = "https://yz.chsi.com.cn"
    # The ``start`` query parameter advances by this many entries per page.
    PAGE_SIZE = 20
    # Seconds to wait for the server before giving up on a request.
    TIMEOUT = 10
    # Placeholder written when a field is missing from the page. The spelling
    # is kept as-is so new rows stay consistent with rows already on disk.
    MISSING = "unkonwn"
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36 Edg/140.0.0.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    }

    def __init__(self) -> None:
        """Open ``./data.csv`` for appending and write the header if needed."""
        self.url = "https://yz.chsi.com.cn/sch/?start={}"
        self.file = open("./data.csv", "+a", encoding="utf-8", newline="")
        # The file is opened in append mode, so it may already hold rows from
        # a previous run. Only emit the header when the file is still empty,
        # otherwise every run would insert another header line mid-file.
        self.file.seek(0, 2)  # 2 == SEEK_END
        if self.file.tell() == 0:
            self.file.write("学校名称,学校性质,招生简章的链接,调剂办法的链接\n")

    def __enter__(self) -> "YXZS":
        """Return the crawler itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the output file when the ``with`` block ends."""
        self.close()

    def close(self) -> None:
        """Flush and close the output file; safe to call more than once."""
        if not self.file.closed:
            self.file.close()

    def get_html(self, page=1):
        """Fetch one listing page and return its HTML text.

        Args:
            page: 1-based page number; translated to the ``start`` offset
                ``(page - 1) * PAGE_SIZE``.

        Returns:
            The response body when the server answers with status 200,
            otherwise an empty string (also on network errors or timeouts).
        """
        try:
            resp = requests.get(
                self.url.format((page - 1) * self.PAGE_SIZE),
                headers=self.HEADERS,
                timeout=self.TIMEOUT,
            )
        except requests.RequestException as exc:
            # A single failing page should not abort the whole crawl; report
            # it and let the caller treat it like an empty page.
            print("第{}页请求失败: {}".format(page, exc))
            return ""
        if resp.status_code == 200:
            return resp.text
        return ""

    @staticmethod
    def _first(node, path):
        """Return the first result of ``path`` evaluated on ``node``, or None."""
        found = node.xpath(path)
        return found[0] if found else None

    def _text(self, node, path):
        """Return the stripped first XPath match, or the missing placeholder."""
        value = self._first(node, path)
        return self.MISSING if value is None else value.strip()

    def _link(self, node, path):
        """Return the first XPath match as an absolute URL, or the placeholder."""
        value = self._first(node, path)
        return self.MISSING if value is None else self.BASE_URL + value

    def parse_html(self, html_text: str):
        """Extract every school entry from ``html_text`` and save it.

        For each ``div.sch-item`` element a dict with the keys ``name``,
        ``tag``, ``href1`` and ``href2`` is built and handed to
        ``save_data``. Missing fields are filled with the placeholder.
        """
        html = etree.HTML(html_text)
        if html is None:
            # NOTE(review): lxml may return None when no tree can be built
            # from the input; in that case there is nothing to extract.
            return
        for sch in html.xpath('//div[@class="sch-item"]'):
            item = {
                "name": self._text(
                    sch,
                    './/a[@class="name js-yxk-yxmc text-decoration-none"]/text()',
                ),
                "tag": self._text(sch, './/span[@class="sch-tag"]/text()'),
                # Second and fourth anchors inside div.sch-link; presumably the
                # two link columns named in the CSV header - verify against
                # the live page markup.
                "href1": self._link(sch, './/div[@class="sch-link"]/a[2]/@href'),
                "href2": self._link(sch, './/div[@class="sch-link"]/a[4]/@href'),
            }
            self.save_data(item)

    def save_data(self, item):
        """Append ``item`` as one CSV row, columns in the dict's key order."""
        writer = csv.DictWriter(self.file, fieldnames=list(item))
        writer.writerow(item)

    def run(self, pages=47):
        """Crawl listing pages ``1..pages`` and store every entry found.

        Args:
            pages: Number of listing pages to fetch (defaults to 47).
        """
        try:
            # Pages are 1-based: get_html maps page 1 to start=0.
            for page in range(1, pages + 1):
                print("正在获取第{}页".format(page))
                html_text = self.get_html(page=page)
                if html_text == "":
                    print("第{}页获取的内容为空".format(page))
                    continue
                self.parse_html(html_text)
        finally:
            # Push buffered rows to disk even if the crawl is interrupted.
            if not self.file.closed:
                self.file.flush()


if __name__ == "__main__":
    with YXZS() as crawler:
        crawler.run()