feat(*): 代码仓库初始化

2025-11-07 18:39:29 +08:00
commit b1aeb6b39b
36 changed files with 1700 additions and 0 deletions
--- a/Spider/.gitignore
+++ b/Spider/.gitignore
@@ -0,0 +1 @@
+data.csv
--- a/Spider/README.md
+++ b/Spider/README.md
@@ -0,0 +1,15 @@
+<!--
+ Copyright (c) 2025 zhilv
+ 
+ This software is released under the MIT License.
+ https://opensource.org/licenses/MIT
+-->
+
+## `Python` 网络爬虫基础教程
+
+### 文件介绍
+
+```sh
+.
+└── test1.py  # 获取研招网数据
+```
--- a/Spider/test1.py
+++ b/Spider/test1.py
@@ -0,0 +1,74 @@
+import csv
+import requests
+from lxml import etree
+
+
+class YXZS:
+    """Crawl the yz.chsi.com.cn school list into ./data.csv, one row per school."""
+
+    def __init__(self) -> None:
+        self.url = "https://yz.chsi.com.cn/sch/?start={}"
+        self.file = open("./data.csv", "+a", encoding="utf-8", newline="")
+        # The file is opened for appending, so emit the header only while it is
+        # still empty; otherwise every run would add another header row.
+        if self.file.seek(0, 2) == 0:
+            self.file.write("学校名称,学校性质,招生简章的链接,调剂办法的链接\n")
+
+    def get_html(self, page=1):
+        """Return the HTML of 1-based listing page `page`, or "" on any failure."""
+        try:
+            resp = requests.get(
+                self.url.format((page - 1) * 20),
+                headers={
+                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36 Edg/140.0.0.0",
+                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+                    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
+                },
+                timeout=10,  # requests waits forever unless a timeout is given
+            )
+        except requests.RequestException:
+            return ""
+        return resp.text if resp.status_code == 200 else ""
+
+    @staticmethod
+    def _first(node, path, prefix=""):
+        """Return `prefix` + the first stripped XPath hit, or "unknown" if none."""
+        hits = node.xpath(path)  # evaluate each query once instead of twice
+        return prefix + hits[0].strip() if hits else "unknown"
+
+    def parse_html(self, html_text: str):
+        """Save one row for every `sch-item` div found in `html_text`."""
+        html = etree.HTML(html_text)
+        if html is None:  # lxml may return None when nothing could be parsed
+            return
+        site = "https://yz.chsi.com.cn"
+        name_path = './/a[@class="name js-yxk-yxmc text-decoration-none"]/text()'
+        for sch in html.xpath('//div[@class="sch-item"]'):
+            item = {
+                "name": self._first(sch, name_path),
+                "tag": self._first(sch, './/span[@class="sch-tag"]/text()'),
+                "href1": self._first(sch, './/div[@class="sch-link"]/a[2]/@href', site),
+                "href2": self._first(sch, './/div[@class="sch-link"]/a[4]/@href', site),
+            }
+            self.save_data(item)
+
+    def save_data(self, item):
+        """Append `item` to the CSV file, one column per key in insertion order."""
+        csv.DictWriter(self.file, fieldnames=list(item)).writerow(item)
+
+    def run(self):
+        """Crawl listing pages 1-47 in order and always close the CSV file."""
+        try:
+            for page in range(1, 48):
+                print("正在获取第{}页".format(page))
+                html_text = self.get_html(page=page)
+                if html_text == "":
+                    print("第{}页获取的内容为空".format(page))
+                    continue
+                self.parse_html(html_text)
+        finally:
+            self.file.close()
+
+
+if __name__ == "__main__":
+    YXZS().run()  # script entry point: build the crawler and start the crawl