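# Scrape target: Baidu Tieba forum content
# URL: https://tieba.baidu.com/f?
# Data requirement: enter a Tieba forum name and a page count, then download each page in full
# Request library: urllib
# Coding style: functions
# Difficulty: beginner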
from urllib.request import Request,urlopen
from urllib.parse import urlencode
def get_html(url):
    """Fetch *url* and return the raw response body as bytes.

    The request is sent with a browser-like ``User-Agent`` header instead of
    urllib's default one.

    Args:
        url: Address to request.

    Returns:
        The undecoded response body.

    Raises:
        urllib.error.URLError: If the request cannot be completed
            (``HTTPError`` is a subclass raised for HTTP error statuses).
    """
    headers = {
        'User-Agent': 'Mozilla / 5.0(Windows NT 6.1;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 88.0.4324.182 Safari / 537.36 Edg / 88.0.705.74'
    }
    request = Request(url, headers=headers)
    # Use the response as a context manager so the underlying connection is
    # always released, even if read() raises.
    with urlopen(request) as response:
        return response.read()
def save_html(file_name, html_bytes):
    """Write *html_bytes* to the file at *file_name*.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten and the bytes are stored without any decoding.

    Args:
        file_name: Path of the file to create or overwrite.
        html_bytes: Raw bytes to store.
    """
    with open(file_name, mode='wb') as out_file:
        out_file.write(html_bytes)
def main():
    """Prompt for a forum name and a page count, then download each page.

    For every page index from 0 up to (but not including) the requested
    count, a listing URL is built with a ``pn`` offset of ``index * 50``,
    fetched with ``get_html`` and stored with ``save_html`` in a file named
    after the 1-based page number in the current working directory.

    Raises:
        ValueError: If the page count entered is not a valid integer.
    """
    base_url = 'https://tieba.baidu.com/f?'
    tieba_name = input('输入贴吧名:')
    page = int(input('输入要爬取的页数:'))
    for num in range(page):
        page_no = num + 1  # pages are numbered from 1 for the user
        args = {
            'kw': tieba_name,
            'ie': 'utf-8',
            'pn': num * 50,
        }
        # urlencode percent-escapes the (possibly non-ASCII) forum name.
        url = base_url + urlencode(args)
        file_name = '第' + str(page_no) + '页.html'
        # Announce the page before fetching so the message reflects work in
        # progress rather than work that has already finished.
        print('正在下载第%d页' % page_no)
        html_bytes = get_html(url)
        save_html(file_name, html_bytes)
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()