feat(*): 代码仓库初始化
This commit is contained in:
1
Spider/.gitignore
vendored
Normal file
1
Spider/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
data.csv
|
||||
15
Spider/README.md
Normal file
15
Spider/README.md
Normal file
@@ -0,0 +1,15 @@
|
||||
<!--
|
||||
Copyright (c) 2025 zhilv
|
||||
|
||||
This software is released under the MIT License.
|
||||
https://opensource.org/licenses/MIT
|
||||
-->
|
||||
|
||||
## `Python` 网络爬虫基础教程
|
||||
|
||||
### 文件介绍
|
||||
|
||||
```sh
|
||||
.
|
||||
└── test1.py # 获取研招网数据
|
||||
```
|
||||
74
Spider/test1.py
Normal file
74
Spider/test1.py
Normal file
@@ -0,0 +1,74 @@
|
||||
import csv
|
||||
import requests
|
||||
from lxml import etree
|
||||
|
||||
|
||||
class YXZS:
    """Scrape the school list of yz.chsi.com.cn into ``./data.csv``.

    One CSV row is written per ``div.sch-item`` element found on a listing
    page.  The columns follow the header written by ``__init__``: school
    name, school type, admission-brochure link and adjustment-policy link.

    An instance is single-use: ``__init__`` opens the output file and
    ``run`` closes it when the crawl ends.
    """

    # Prefix prepended to the hrefs extracted from a listing page.
    SITE_ROOT = "https://yz.chsi.com.cn"
    # Step of the ``start`` query parameter between two consecutive pages.
    PAGE_SIZE = 20
    # Seconds to wait for the server before giving up on a page.
    REQUEST_TIMEOUT = 10
    # Placeholder stored when a field is missing.  The spelling is kept
    # unchanged so new rows stay consistent with previously written data.
    MISSING = "unkonwn"
    # Browser-like request headers sent with every page request.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36 Edg/140.0.0.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    }

    def __init__(self) -> None:
        """Open ``./data.csv`` for appending and write the header if needed."""
        self.url = "https://yz.chsi.com.cn/sch/?start={}"
        self.file = open("./data.csv", "+a", encoding="utf-8", newline="")
        # Append mode positions the stream at the end of the file, so a
        # position of 0 means the file is new or empty.  Writing the header
        # only then avoids a duplicated header line on every re-run.
        if self.file.tell() == 0:
            self.file.write("学校名称,学校性质,招生简章的链接,调剂办法的链接\n")

    def get_html(self, page=1):
        """Download one listing page.

        Args:
            page: 1-based page number; it is converted to the ``start``
                offset ``(page - 1) * PAGE_SIZE`` expected by the URL.

        Returns:
            The response body as text, or ``""`` when the request fails or
            the server does not answer with status 200.
        """
        try:
            resp = requests.get(
                self.url.format((page - 1) * self.PAGE_SIZE),
                headers=self.HEADERS,
                # Without a timeout a stalled connection blocks forever.
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException:
            # Treated like a non-200 answer: the caller skips this page.
            return ""
        if resp.status_code == 200:
            return resp.text
        return ""

    @staticmethod
    def _first(node, path):
        """Return the first result of XPath *path* on *node*, or ``None``."""
        found = node.xpath(path)
        return found[0] if found else None

    def parse_html(self, html_text: str):
        """Extract every school entry from a listing page and save it.

        Args:
            html_text: HTML source of one listing page.

        Each ``div.sch-item`` element yields one row passed to
        ``save_data``; fields that cannot be found are stored as
        ``MISSING``.
        """
        if not html_text:
            return
        html = etree.HTML(html_text)
        if html is None:
            # NOTE(review): lxml appears to return None when the input has
            # no parsable content; guard so that case does not crash.
            return

        for sch in html.xpath('//div[@class="sch-item"]'):
            name = self._first(
                sch,
                './/a[@class="name js-yxk-yxmc text-decoration-none"]/text()',
            )
            tag = self._first(sch, './/span[@class="sch-tag"]/text()')
            # The 2nd and 4th anchors of the link bar feed the two link
            # columns of the CSV.
            href1 = self._first(sch, './/div[@class="sch-link"]/a[2]/@href')
            href2 = self._first(sch, './/div[@class="sch-link"]/a[4]/@href')

            item = {
                "name": name.strip() if name is not None else self.MISSING,
                "tag": tag.strip() if tag is not None else self.MISSING,
                "href1": (
                    self.SITE_ROOT + href1 if href1 is not None else self.MISSING
                ),
                "href2": (
                    self.SITE_ROOT + href2 if href2 is not None else self.MISSING
                ),
            }
            self.save_data(item)

    def save_data(self, item):
        """Append *item* to the output file as one CSV row.

        Args:
            item: Mapping whose keys define the column order of the row.
        """
        writer = csv.DictWriter(self.file, fieldnames=list(item))
        writer.writerow(item)

    def run(self, pages=47):
        """Crawl listing pages ``1..pages`` and store every school found.

        Args:
            pages: Number of listing pages to fetch, starting at page 1.

        Pages whose download yields no content are reported and skipped.
        The output file is closed when the crawl ends, even on error.
        """
        try:
            # ``get_html`` expects 1-based page numbers.
            for page in range(1, pages + 1):
                print("正在获取第{}页".format(page))
                html_text = self.get_html(page=page)
                if html_text == "":
                    print("第{}页获取的内容为空".format(page))
                    continue
                self.parse_html(html_text)
        finally:
            self.file.close()
|
||||
|
||||
|
||||
# Script entry point: build the spider and crawl the listing pages.
if __name__ == "__main__":
    spider = YXZS()
    spider.run()
|
||||
Reference in New Issue
Block a user