Baidu Tieba
A classic beginner crawler example
Goal
Crawl Baidu Tieba forum pages.
Requirements
- 1. Prompt for the name of the forum to crawl
- 2. Handle paging: the user specifies a start page and an end page
- 3. Save each crawled page locally, e.g. 1.html, 2.html
Approach
Take the 艾斯 forum's URLs as an example.
Page 2 URL: https://tieba.baidu.com/f?kw=艾斯&ie=utf-8&pn=50
Page 3 URL: https://tieba.baidu.com/f?kw=艾斯&ie=utf-8&pn=100
Page 4 URL: https://tieba.baidu.com/f?kw=艾斯&ie=utf-8&pn=150
URL structure:
Base: https://tieba.baidu.com/f
Keyword: kw=艾斯
Paging: pn = i * 50, where i counts from 0, so page n corresponds to pn = (n - 1) * 50
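As a quick sanity check on that formula, here is a minimal sketch (build_url is a hypothetical helper, not part of the original post) that reproduces the example URLs above:

from urllib import parse

def build_url(forum, page):
    # pn = (page - 1) * 50: page 1 -> pn=0, page 2 -> pn=50, ...
    query = parse.urlencode({"kw": forum, "ie": "utf-8", "pn": (page - 1) * 50})
    return "https://tieba.baidu.com/f?" + query

print(build_url("艾斯", 2))
# https://tieba.baidu.com/f?kw=%E8%89%BE%E6%96%AF&ie=utf-8&pn=50
# (matches the page-2 URL above, with the Chinese keyword percent-encoded)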
Actual code

from urllib import parse, request
import os


class bd_tieba:
    """
    https://tieba.baidu.com/f?kw=fate&pn=n
    where n = (page - 1) * 50
    """

    def __init__(self):
        self.url_head = "https://tieba.baidu.com/f?"
        self.headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.62"}
        kw = input("Enter the forum name: ")
        self.start = int(input("Enter the start page: "))
        self.end = int(input("Enter the end page: "))
        # Percent-encode the keyword so Chinese characters are legal in the URL
        self.key = {"kw": kw}
        self.kw = parse.urlencode(self.key)

    def gain_date(self, pni):
        """Fetch one page of the forum and return its HTML."""
        # Page 1 is pn=0, page 2 is pn=50, ... so offset the page number by one
        pn = "&pn=" + str((pni - 1) * 50)
        url = self.url_head + self.kw + pn
        print("url: " + url)
        req = request.Request(url=url, headers=self.headers)
        res = request.urlopen(req)
        html = res.read().decode("utf-8")
        return html

    def hold_data(self, pni, html):
        """Save one page to a local file such as tieba/1.html."""
        os.makedirs("tieba", exist_ok=True)  # create the output directory if needed
        file_name = "tieba/" + str(pni) + ".html"
        with open(file_name, "w", encoding="utf-8") as file:
            file.write(html)
        print("Page " + str(pni) + " saved!")

    def main(self):
        """Fetch and save every page from start to end, inclusive."""
        for pni in range(self.start, self.end + 1):
            html = self.gain_date(pni)
            self.hold_data(pni, html)


if __name__ == '__main__':
    tb = bd_tieba()
    tb.main()
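The class above stops at the first network failure: if one request times out, an unhandled URLError kills the whole run. A variant of gain_date (a sketch, not part of the original post) that skips failed pages and pauses between requests:

from urllib import request, error
import time

def gain_date(self, pni):
    """Like the original, but returns None instead of raising on a failed page."""
    pn = "&pn=" + str((pni - 1) * 50)
    url = self.url_head + self.kw + pn
    try:
        req = request.Request(url=url, headers=self.headers)
        res = request.urlopen(req, timeout=10)
        return res.read().decode("utf-8")
    except error.URLError as e:  # HTTPError is a subclass of URLError
        print("Failed to fetch page " + str(pni) + ": " + str(e))
        return None
    finally:
        time.sleep(1)  # be polite: pause between requests

With this version, main would also need a None check before calling hold_data.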
