import requests
import re
from bs4 import BeautifulSoup


def get_html(url):
    """Fetch a page and return its text, or None on failure."""
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        return None


def get_list(url):
    """Find the maximum page number from the pager, then build one URL per page."""
    html = get_html(url)
    if html is None:
        return []
    soup = BeautifulSoup(html, 'lxml')
    # The thread's pager; its second-to-last child is the "last page" link.
    pager = soup.find('li', class_='l_pager pager_theme_4 pb_list_pager').contents
    max_page = re.findall(r'pn=(.+?)"', str(pager[-2]))[0]
    page = input("Number of pages to crawl (<= " + max_page + "): ")
    url_list = []
    for i in range(int(page)):
        url_list.append(url + '?pn=' + str(i + 1))
    return url_list


def get_content(url_list):
    """Download every in-post image from each page and save it by counter."""
    print("Crawling...")
    num = 0
    for url in url_list:
        html = get_html(url)
        if html is None:
            continue
        soup = BeautifulSoup(html, 'lxml')
        # Images posted inside the thread carry the BDE_Image class.
        imgs = soup.find_all('img', class_='BDE_Image')
        for img in imgs:
            print(num)
            try:
                num = num + 1
                # Name each file by its counter, reusing the last four
                # characters of the source URL as a crude extension.
                with open('F:/爬虫/img/' + str(num) + img['src'][-4:], 'wb') as f:
                    f.write(requests.get(img['src'], timeout=30).content)
            except (requests.RequestException, OSError, KeyError):
                # Network, file, or missing-src errors: roll the counter back.
                num = num - 1
                print("error")
    print("Done: saved " + str(num) + " images in total")


url = "https://tieba.baidu.com/p/6289398506"
get_content(get_list(url))
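The trickiest step above is finding the maximum page number: the pager's second-to-last child is the "last page" link, and its href carries the highest pn= value. Below is a minimal, self-contained sketch of that extraction, assuming a simplified stand-in for Tieba's real pager markup (the live HTML is more complex and may differ):

import re

# Hypothetical, simplified pager markup used only for illustration.
sample = ('<li class="l_pager pager_theme_4 pb_list_pager">'
          '<a href="/p/6289398506?pn=2">2</a>'
          '<a href="/p/6289398506?pn=42">尾页</a></li>')

# The last pn= value in the pager is the maximum page number.
max_page = re.findall(r'pn=(\d+)', sample)[-1]
print(max_page)  # -> 42

The script itself reaches the same value through BeautifulSoup, applying the regex to str(pager[-2]) so it only ever sees the "last page" link.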