【爬虫】批量采集美女头像

只要修改代码里的列表页地址（base_url）和对应的 CSS 选择器，就可以采集其他类型的头像，不局限于美女头像。

import requests
from bs4 import BeautifulSoup
import os
import threading

# 下载单张图片
def download_image(img_url, save_dir):
    """Download one image and store it inside ``save_dir``.

    The file name is the last path segment of ``img_url`` (query string and
    fragment excluded). Any failure is reported on stdout and swallowed, so a
    single bad image never aborts the caller's loop.

    Args:
        img_url: Absolute URL of the image to fetch.
        save_dir: Directory the file is written into.
    """
    # Local import keeps this block usable without touching the file header.
    from urllib.parse import urlsplit

    try:
        response = requests.get(img_url, timeout=10)
        # Without this check a 403/404 error page would be saved as an "image".
        response.raise_for_status()

        # Use only the URL path so "a.jpg?x=1" is stored as "a.jpg".
        file_name = os.path.basename(urlsplit(img_url).path)
        if not file_name:
            raise ValueError("URL 中没有文件名")

        img_name = os.path.join(save_dir, file_name)
        with open(img_name, "wb") as f:
            f.write(response.content)
        print(f"下载成功: {img_url} -> {img_name}")
    except Exception as e:
        # Thread-level boundary: log and carry on (deliberate best effort).
        print(f"下载失败: {img_url}, 错误信息: {e}")

# 爬取详情页中的图片
def scrape_detail_page(detail_url, save_dir):
    """Download every image found in the content area of one detail page.

    Images are located with the CSS selector ``#content p img`` and handed to
    ``download_image`` one by one. Errors are printed and swallowed so that a
    failing detail page does not kill the worker thread with a traceback.

    Args:
        detail_url: Absolute URL of the detail page.
        save_dir: Directory the images are written into.
    """
    # Local import keeps this block usable without touching the file header.
    from urllib.parse import urljoin

    try:
        print(f"正在爬取详情页: {detail_url}")
        response = requests.get(detail_url, timeout=10)
        if response.status_code != 200:
            print(f"无法访问详情页: {detail_url}, 状态码: {response.status_code}")
            return

        soup = BeautifulSoup(response.content, "html.parser")
        for img_tag in soup.select("#content p img"):
            img_src = img_tag.get("src")
            if not img_src:
                # An <img> without src used to raise KeyError and abort the
                # remaining images of the page; skip just that tag instead.
                continue
            # Resolve relative sources against the page they appear on.
            download_image(urljoin(detail_url, img_src), save_dir)
    except Exception as e:
        print(f"爬取详情页失败: {detail_url}, 错误信息: {e}")

# 爬取主页面，获取所有详情页链接
def scrape_images(base_url, save_dir, thread_count, max_retries=3):
    """Crawl the paginated listing and download images from every detail page.

    Listing pages are visited in order starting at page 1. Each detail link is
    processed by ``scrape_detail_page`` in its own thread, with at most
    ``thread_count`` threads alive at a time. The crawl stops at the first
    listing page that answers with a non-200 status or contains no detail
    links, or after ``max_retries`` consecutive failed requests.

    Args:
        base_url: URL of the first listing page. Every "_1" in it is replaced
            by "_<page>" to build the URL of later pages.
        save_dir: Target directory; created if it does not exist.
        thread_count: Maximum number of concurrent detail-page threads.
            Values below 1 are treated as 1.
        max_retries: Consecutive request failures tolerated before giving up.
            Values below 1 are treated as 1.
    """
    # Local import keeps this block usable without touching the file header.
    from urllib.parse import urljoin

    os.makedirs(save_dir, exist_ok=True)

    # A limit of 0 or less would make the throttle loop below spin forever.
    max_threads = max(1, thread_count)
    max_retries = max(1, max_retries)

    page = 1
    failures = 0
    threads = []

    while True:
        url = base_url.replace("_1", f"_{page}")
        print(f"正在爬取页面: {url}")

        try:
            response = requests.get(url, timeout=10)
        except Exception as e:
            print(f"爬取页面失败: {url}, 错误信息: {e}")
            # The page number is not advanced on failure, so the retry must be
            # bounded; otherwise a dead network means an endless loop.
            failures += 1
            if failures >= max_retries:
                print(f"连续 {failures} 次请求失败，停止爬取。")
                break
            continue
        failures = 0

        if response.status_code != 200:
            print(f"无法访问页面: {url}, 状态码: {response.status_code}")
            break

        soup = BeautifulSoup(response.content, "html.parser")
        detail_links = soup.select("ul.g-gxlist-imgbox li a")
        if not detail_links:
            print("未找到更多详情页链接，爬取结束。")
            break

        for link in detail_links:
            href = link.get("href")
            if not href:
                continue
            # Resolve against the listing page: handles root-relative links as
            # before, and also absolute or page-relative ones.
            detail_url = urljoin(url, href)

            thread = threading.Thread(target=scrape_detail_page, args=(detail_url, save_dir))
            threads.append(thread)
            thread.start()

            # Throttle: wait until at least one worker has finished.
            while len(threads) >= max_threads:
                for t in threads:
                    t.join(0.1)
                threads = [t for t in threads if t.is_alive()]

        page += 1

    # Wait for the remaining workers before returning.
    for t in threads:
        t.join()

# 示例用法
thread_count = 5  # maximum number of concurrent detail-page threads; adjust as needed
save_directory = "qqtn_images"  # downloaded images end up in this folder
base_url = "https://www.qqtn.com/tx/nvshengtx_1.html"  # first listing page
scrape_images(base_url=base_url, save_dir=save_directory, thread_count=thread_count)

下面是第二个版本：采集 imeitou.com 的头像，请求时带上 User-Agent 和 Referer，并用 urljoin 处理相对链接。

import requests
from bs4 import BeautifulSoup
import os
import threading
from urllib.parse import urljoin  # 用于处理相对URL

# 定义一个通用的函数来获取响应，设置User-Agent和可选的Referer
def get_response(url, referer=None):
    """Issue a GET request with a browser-like User-Agent.

    Args:
        url: Address to fetch.
        referer: Optional value for the ``Referer`` header; omitted when falsy.

    Returns:
        The ``requests`` response object, or ``None`` when the request raised
        (the error is printed, not propagated).
    """
    request_headers = {
        # Pretend to be a desktop Chrome browser.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    }
    if referer:
        request_headers['Referer'] = referer

    try:
        return requests.get(url, headers=request_headers, timeout=10)
    except Exception as err:
        print(f"请求失败: {url}, 错误信息: {err}")
    return None

# 下载单张图片并保存到本地
def download_image(img_url, save_dir, referer_url):
    """Download one image and store it inside ``save_dir``.

    The request goes through ``get_response`` with ``referer_url`` as the
    ``Referer`` header. The file name is the last path segment of ``img_url``
    (query string and fragment excluded). Failures are printed and swallowed.

    Args:
        img_url: Absolute URL of the image to fetch.
        save_dir: Directory the file is written into.
        referer_url: URL of the page the image was found on.
    """
    # Local import keeps this block usable without touching the file header.
    from urllib.parse import urlsplit

    try:
        response = get_response(img_url, referer_url)

        # Compare with None explicitly: a requests Response is falsy for
        # 4xx/5xx, which used to make the log say "None" instead of the
        # real status code.
        if response is None:
            print(f"下载失败: {img_url}, 状态码: None")
            return
        if response.status_code != 200:
            print(f"下载失败: {img_url}, 状态码: {response.status_code}")
            return

        # Use only the URL path so "a.jpg?x=1" is stored as "a.jpg".
        file_name = os.path.basename(urlsplit(img_url).path)
        if not file_name:
            raise ValueError("URL 中没有文件名")

        img_name = os.path.join(save_dir, file_name)
        with open(img_name, "wb") as f:
            f.write(response.content)
        print(f"下载成功: {img_url} -> {img_name}")
    except Exception as e:
        print(f"下载失败: {img_url}, 错误信息: {e}")

# 爬取详情页中的图片
def scrape_detail_page(detail_url, save_dir):
    """Download every ``<img>`` that carries an ``alt`` attribute on a detail page.

    Each image source is resolved against ``detail_url`` and passed to
    ``download_image`` with the detail page as referer. Errors are printed and
    swallowed so that a failing page does not end the worker thread with a
    traceback.

    Args:
        detail_url: Absolute URL of the detail page.
        save_dir: Directory the images are written into.
    """
    try:
        print(f"正在爬取详情页: {detail_url}")
        response = get_response(detail_url)

        # Compare with None explicitly: a requests Response is falsy for
        # 4xx/5xx, so a truthiness test would report those as a failed
        # request and drop the status code.
        if response is None:
            print(f"请求详情页失败: {detail_url}")
            return
        if response.status_code != 200:
            print(f"无法访问详情页: {detail_url}, 状态码: {response.status_code}")
            return

        soup = BeautifulSoup(response.content, "html.parser")
        for img_tag in soup.find_all("img", alt=True):
            img_src = img_tag.get("src")
            if not img_src:
                # An <img> without src used to raise KeyError and abort the
                # remaining images of the page; skip just that tag instead.
                continue
            img_url = urljoin(detail_url, img_src)  # relative -> absolute
            download_image(img_url, save_dir, detail_url)
    except Exception as e:
        print(f"爬取详情页失败: {detail_url}, 错误信息: {e}")

# 爬取主页面的图片和详情链接
def scrape_images(base_url, save_dir, thread_count, max_retries=3):
    """Crawl the paginated listing and download images from every detail page.

    Page 1 is ``base_url`` itself; page N (N > 1) is
    ``{base_url}index_{N}.html``. Each detail link is processed by
    ``scrape_detail_page`` in its own thread, with at most ``thread_count``
    threads alive at a time. The crawl stops at the first listing page that
    answers with a non-200 status or contains no detail links, or after
    ``max_retries`` consecutive failed requests.

    Args:
        base_url: URL of the first listing page.
        save_dir: Target directory; created if it does not exist.
        thread_count: Maximum number of concurrent detail-page threads.
            Values below 1 are treated as 1.
        max_retries: Consecutive request failures tolerated before giving up.
            Values below 1 are treated as 1.
    """
    os.makedirs(save_dir, exist_ok=True)

    # A limit of 0 or less would make the throttle loop below spin forever.
    max_threads = max(1, thread_count)
    max_retries = max(1, max_retries)

    page = 1
    failures = 0
    threads = []

    while True:
        url = f"{base_url}index_{page}.html" if page > 1 else base_url
        print(f"正在爬取页面: {url}")

        try:
            response = get_response(url)
        except Exception as e:
            print(f"爬取页面失败: {url}, 错误信息: {e}")
            response = None
        else:
            if response is None:
                print(f"请求页面失败: {url}")

        if response is None:
            # The page number is not advanced on failure, so the retry must be
            # bounded; otherwise a dead network means an endless loop.
            failures += 1
            if failures >= max_retries:
                print(f"连续 {failures} 次请求失败，停止爬取。")
                break
            continue
        failures = 0

        # Test the status code directly. A requests Response is falsy for
        # 4xx/5xx, so the old `elif response:` sent a 404 past the last page
        # into the retry branch and the loop never ended.
        if response.status_code != 200:
            print(f"无法访问页面: {url}, 状态码: {response.status_code}")
            break

        soup = BeautifulSoup(response.content, "html.parser")
        detail_links = soup.select("ul.g-gxlist-imgbox li a")
        if not detail_links:
            print("未找到更多详情页链接，爬取结束。")
            break

        for link in detail_links:
            detail_href = link.get("href")
            if not detail_href:
                continue
            detail_url = urljoin(base_url, detail_href)  # relative -> absolute

            thread = threading.Thread(target=scrape_detail_page, args=(detail_url, save_dir))
            threads.append(thread)
            thread.start()

            # Throttle: wait until at least one worker has finished.
            while len(threads) >= max_threads:
                for t in threads:
                    t.join(0.1)
                threads = [t for t in threads if t.is_alive()]

        page += 1

    # Wait for the remaining workers before returning.
    for t in threads:
        t.join()

# 示例用法
thread_count = 5  # maximum number of concurrent detail-page threads; adjust as needed
save_directory = "imeitou_downloads"  # downloaded images end up in this folder
base_url = "http://www.imeitou.com/nvsheng/omfns/"  # first listing page
scrape_images(base_url=base_url, save_dir=save_directory, thread_count=thread_count)

2026 年 6 月
日	一	二	三	四	五	六
	1	2	3	4	5	6
7	8	9	10	11	12	13
14	15	16	17	18	19	20
21	22	23	24	25	26	27
28	29	30