bs4库应用(三)

杀不尽的欧洲狗流不尽的非洲泪!

import requests
import re
from bs4 import BeautifulSoup

def get_html(url):
    """Fetch *url* and return the decoded page text.

    On any request failure the sentinel string " ERROR " is returned so
    callers can continue (the original best-effort contract is kept).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Tieba pages are UTF-8; requests may mis-guess from headers.
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # and programming errors are no longer silently swallowed.
        return " ERROR "

def get_list(url):
    """Prompt for a page count and build the list of per-page URLs.

    Scrapes the thread's pager to find the maximum page number, asks the
    user how many pages to crawl, and returns
    ``[url?pn=1, url?pn=2, ...]`` for that many pages.
    """
    soup = BeautifulSoup(get_html(url), 'lxml')
    # The pager <li> holds the page links; its second-to-last child links
    # to the final page, whose `pn=` query value is the max page number.
    # Renamed from `list`, which shadowed the builtin.
    pager = soup.find('li', class_='l_pager pager_theme_4 pb_list_pager').contents
    max_page = re.findall(r"pn=(.+?)\"", str(pager[-2]))[0]
    page = int(input("输入爬取页数<=" + max_page))
    # Clamp to the real page count so a typo can't request missing pages.
    page = min(page, int(max_page))
    return [url + '?pn=' + str(i) for i in range(1, page + 1)]

def get_content(url_list, save_dir='F:/爬虫/img/'):
    """Download every post image from each page URL in *url_list*.

    Images are saved as ``<save_dir><n><ext>`` where ``<ext>`` is the last
    four characters of the image URL (e.g. '.jpg', '.png').  Prints
    progress and a final count; returns None.

    The output directory is now a parameter (default keeps the original
    hard-coded path, so existing callers are unaffected).
    """
    print("爬取中。。。")
    num = 0
    for url in url_list:
        soup = BeautifulSoup(get_html(url), 'lxml')
        # Tieba post images carry the BDE_Image class.
        for img in soup.find_all('img', class_="BDE_Image"):
            print(num)
            src = img['src']
            try:
                # NOTE(review): assumes a 3-char extension ('.jpg'); a
                # '.jpeg' URL would be truncated — confirm source URLs.
                path = save_dir + str(num + 1) + src[-4:]
                # 'wb' suffices for write-only output; added a timeout so
                # one stalled image can't hang the whole crawl.
                with open(path, 'wb') as f:
                    f.write(requests.get(src, timeout=30).content)
                # Count only after a successful save — replaces the
                # original increment-then-decrement-on-error dance.
                num += 1
            except (requests.RequestException, OSError):
                print("error")
    print("爬取结束共获得" + str(num) + "个图片")

# Entry point: crawl a single Tieba thread's images.  Guarded so that
# importing this module no longer kicks off a network crawl as a side
# effect; running it as a script behaves exactly as before.
if __name__ == '__main__':
    url = "https://tieba.baidu.com/p/6289398506"
    get_content(get_list(url))

下载图片的关键代码:
    with open('F:/爬虫/img/' + str(num) + li['src'][-4:], 'wb+') as f:
        f.write(requests.get(li['src']).content)


   转载规则


《bs4库应用(三)》 刘坤胤 采用 知识共享署名 4.0 国际许可协议 进行许可。