# Note: with small changes this code can also fetch other kinds of avatars; it is not limited to pictures of girls.
import requests
from bs4 import BeautifulSoup
import os
import threading
# Download a single image.
def download_image(img_url, save_dir):
    """Download one image and write it into ``save_dir``.

    The file name is the last path segment of ``img_url``; the query string
    and fragment are not part of it. Every failure is printed instead of
    raised, so one bad image does not stop the caller's loop.

    Args:
        img_url: Absolute URL of the image.
        save_dir: Directory the file is written into.
    """
    # Local import: urlsplit is not among the file-level imports.
    from urllib.parse import urlsplit

    try:
        response = requests.get(img_url, timeout=10)
        # Refuse 4xx/5xx answers so an error page is never saved as an image.
        response.raise_for_status()
        file_name = os.path.basename(urlsplit(img_url).path)
        if not file_name:
            raise ValueError("URL 中没有文件名")
        img_name = os.path.join(save_dir, file_name)
        with open(img_name, "wb") as f:
            f.write(response.content)
        print(f"下载成功: {img_url} -> {img_name}")
    except Exception as e:  # best-effort boundary: report and carry on
        print(f"下载失败: {img_url}, 错误信息: {e}")
# Scrape the images contained in one detail page.
def scrape_detail_page(detail_url, save_dir):
    """Download every image matched by ``#content p img`` on a detail page.

    Failures are printed, never raised, because this function is used as a
    thread target.

    Args:
        detail_url: URL of the detail page.
        save_dir: Directory passed on to ``download_image``.
    """
    # Local import: urljoin is not among the file-level imports of this script.
    from urllib.parse import urljoin

    try:
        print(f"正在爬取详情页: {detail_url}")
        response = requests.get(detail_url, timeout=10)
        if response.status_code != 200:
            print(f"无法访问详情页: {detail_url}, 状态码: {response.status_code}")
            return
        soup = BeautifulSoup(response.content, "html.parser")
        for img_tag in soup.select("#content p img"):
            img_src = img_tag.get("src")
            if not img_src:
                # An <img> without src has nothing to download; skipping it
                # keeps the remaining images of the page reachable.
                continue
            # Resolve relative / protocol-relative sources against the page;
            # absolute URLs pass through unchanged.
            download_image(urljoin(detail_url, img_src), save_dir)
    except Exception as e:  # thread boundary: report instead of propagating
        print(f"爬取详情页失败: {detail_url}, 错误信息: {e}")
# Crawl the listing pages and collect every detail-page link.
def scrape_images(base_url, save_dir, thread_count, max_failures=3):
    """Walk the paginated listing and scrape each detail page in a thread.

    Page URLs are built by replacing ``_1`` in ``base_url`` with ``_<page>``.
    The crawl stops on a non-200 listing page, on a page without detail
    links, when the page URL cannot change (no ``_1`` in ``base_url``), or
    after ``max_failures`` consecutive page errors.

    Args:
        base_url: URL of the first listing page, containing ``_1``.
        save_dir: Directory for downloaded images; created if missing.
        thread_count: Maximum number of detail-page threads alive at once
            (values below 1 are treated as 1).
        max_failures: Consecutive page errors tolerated before giving up.
    """
    # Local import: urljoin is not among the file-level imports of this script.
    from urllib.parse import urljoin

    os.makedirs(save_dir, exist_ok=True)
    # A limit of 0 or less would make the throttle loop below spin forever.
    thread_count = max(1, thread_count)
    failure_limit = max(1, max_failures)
    page = 1
    failures = 0
    threads = []
    seen = set()  # detail URLs already dispatched
    while True:
        # Build the URL of the current listing page.
        url = base_url.replace("_1", f"_{page}")
        if page > 1 and url == base_url:
            # Without a "_1" placeholder every page would be the same URL.
            print("分页 URL 未变化,爬取结束。")
            break
        # Advance up front so an error below moves on to the next page
        # instead of retrying the same one forever.
        page += 1
        try:
            print(f"正在爬取页面: {url}")
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                print(f"无法访问页面: {url}, 状态码: {response.status_code}")
                break
            soup = BeautifulSoup(response.content, "html.parser")
            detail_links = soup.select("ul.g-gxlist-imgbox li a")
            if not detail_links:
                print("未找到更多详情页链接,爬取结束。")
                break
            for link in detail_links:
                href = link.get("href")
                if not href:
                    continue
                # Root-relative hrefs resolve exactly as the former host
                # concatenation did; absolute hrefs are no longer mangled.
                detail_url = urljoin(url, href)
                if detail_url in seen:
                    continue
                seen.add(detail_url)
                # One thread per detail page.
                thread = threading.Thread(target=scrape_detail_page, args=(detail_url, save_dir))
                threads.append(thread)
                thread.start()
                # Throttle: wait until fewer than thread_count are alive.
                while len(threads) >= thread_count:
                    for t in threads:
                        t.join(0.1)
                    threads = [t for t in threads if t.is_alive()]
            failures = 0
        except Exception as e:  # page-level boundary: report and move on
            print(f"爬取页面失败: {url}, 错误信息: {e}")
            failures += 1
            if failures >= failure_limit:
                print(f"连续 {failures} 次失败,爬取结束。")
                break
    # Wait for the remaining workers.
    for t in threads:
        t.join()
# Example usage: crawl the listing that starts at base_url.
base_url = "https://www.qqtn.com/tx/nvshengtx_1.html"
save_directory = "qqtn_images"
thread_count = 5  # number of concurrent detail-page threads; adjust as needed
scrape_images(
    base_url=base_url,
    save_dir=save_directory,
    thread_count=thread_count,
)
# [/details]  (leftover forum markup; a second, independent script follows)
import requests
from bs4 import BeautifulSoup
import os
import threading
from urllib.parse import urljoin # 用于处理相对URL
# Shared request helper: sets a User-Agent and an optional Referer.
def get_response(url, referer=None):
    """Issue a GET request with browser-like headers.

    Args:
        url: URL to fetch.
        referer: Optional value for the ``Referer`` header; omitted when
            falsy.

    Returns:
        The ``requests.Response`` (whatever its status code), or ``None``
        when the request itself failed; the failure is printed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',  # imitate a Chrome browser
    }
    if referer:
        headers['Referer'] = referer
    try:
        return requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        # Only transport-level failures map to None; programming errors are
        # left to surface in the caller.
        print(f"请求失败: {url}, 错误信息: {e}")
        return None
# Download a single image and save it locally.
def download_image(img_url, save_dir, referer_url):
    """Download one image via ``get_response`` and write it into ``save_dir``.

    The file name is the last path segment of ``img_url``; the query string
    and fragment are not part of it. Failures are printed, never raised.

    Args:
        img_url: Absolute URL of the image.
        save_dir: Directory the file is written into.
        referer_url: Sent as the ``Referer`` header of the request.
    """
    # Local import: only urljoin is imported at file level.
    from urllib.parse import urlsplit

    try:
        response = get_response(img_url, referer_url)
        # Compare with None explicitly: a Response is falsy for 4xx/5xx, which
        # would hide the real status code behind "None".
        if response is None or response.status_code != 200:
            status = "None" if response is None else response.status_code
            print(f"下载失败: {img_url}, 状态码: {status}")
            return
        file_name = os.path.basename(urlsplit(img_url).path)
        if not file_name:
            print(f"下载失败: {img_url}, 错误信息: URL 中没有文件名")
            return
        img_name = os.path.join(save_dir, file_name)
        with open(img_name, "wb") as f:
            f.write(response.content)
        print(f"下载成功: {img_url} -> {img_name}")
    except Exception as e:  # best-effort boundary: report and carry on
        print(f"下载失败: {img_url}, 错误信息: {e}")
# Scrape the images contained in one detail page.
def scrape_detail_page(detail_url, save_dir):
    """Download every ``<img>`` that carries an ``alt`` attribute on a page.

    The detail page URL is passed to ``download_image`` as the Referer.
    Failures are printed, never raised, because this function is used as a
    thread target.

    Args:
        detail_url: URL of the detail page.
        save_dir: Directory passed on to ``download_image``.
    """
    try:
        print(f"正在爬取详情页: {detail_url}")
        response = get_response(detail_url)
        # Explicit None test: a Response is falsy for 4xx/5xx, so truthiness
        # would report an HTTP error as a failed request.
        if response is None:
            print(f"请求详情页失败: {detail_url}")
            return
        if response.status_code != 200:
            print(f"无法访问详情页: {detail_url}, 状态码: {response.status_code}")
            return
        soup = BeautifulSoup(response.content, "html.parser")
        for img_tag in soup.find_all("img", alt=True):
            img_src = img_tag.get("src")
            if not img_src:
                # An <img> without src has nothing to download; skipping it
                # keeps the remaining images of the page reachable.
                continue
            img_url = urljoin(detail_url, img_src)  # make relative URLs absolute
            download_image(img_url, save_dir, detail_url)
    except Exception as e:  # thread boundary: report instead of propagating
        print(f"爬取详情页失败: {detail_url}, 错误信息: {e}")
# Crawl the listing pages and dispatch every detail-page link.
def scrape_images(base_url, save_dir, thread_count, max_failures=3):
    """Walk the paginated listing and scrape each detail page in a thread.

    Page 1 is ``base_url`` itself; later pages are
    ``{base_url}index_{page}.html``. The crawl stops on a listing page whose
    status is not 200, on a page without detail links, or after
    ``max_failures`` consecutive failed pages.

    Args:
        base_url: URL of the first listing page; later page names are
            appended to it verbatim.
        save_dir: Directory for downloaded images; created if missing.
        thread_count: Maximum number of detail-page threads alive at once
            (values below 1 are treated as 1).
        max_failures: Consecutive failed pages tolerated before giving up.
    """
    os.makedirs(save_dir, exist_ok=True)
    # A limit of 0 or less would make the throttle loop below spin forever.
    thread_count = max(1, thread_count)
    failure_limit = max(1, max_failures)
    page = 1
    failures = 0
    threads = []
    seen = set()  # detail URLs already dispatched
    while True:
        # Build the URL of the current listing page.
        url = f"{base_url}index_{page}.html" if page > 1 else base_url
        # Advance up front so a failure moves on to the next page instead of
        # retrying the same one forever.
        page += 1
        try:
            print(f"正在爬取页面: {url}")
            response = get_response(url)
            # Explicit None test: a Response is falsy for 4xx/5xx, so a 404
            # at the end of the pagination must reach the status branch.
            if response is None:
                print(f"请求页面失败: {url}")
                failures += 1
            elif response.status_code != 200:
                print(f"无法访问页面: {url}, 状态码: {response.status_code}")
                break
            else:
                failures = 0
                soup = BeautifulSoup(response.content, "html.parser")
                detail_links = soup.select("ul.g-gxlist-imgbox li a")
                if not detail_links:
                    print("未找到更多详情页链接,爬取结束。")
                    break
                for link in detail_links:
                    detail_href = link.get("href")
                    if not detail_href:
                        continue
                    detail_url = urljoin(base_url, detail_href)  # make relative URLs absolute
                    if detail_url in seen:
                        continue
                    seen.add(detail_url)
                    # One thread per detail page.
                    thread = threading.Thread(target=scrape_detail_page, args=(detail_url, save_dir))
                    threads.append(thread)
                    thread.start()
                    # Throttle: wait until fewer than thread_count are alive.
                    while len(threads) >= thread_count:
                        for t in threads:
                            t.join(0.1)  # short wait, then re-check liveness
                        threads = [t for t in threads if t.is_alive()]
        except Exception as e:  # page-level boundary: report and move on
            print(f"爬取页面失败: {url}, 错误信息: {e}")
            failures += 1
        if failures >= failure_limit:
            print(f"连续 {failures} 次失败,爬取结束。")
            break
    # Wait for the remaining workers.
    for t in threads:
        t.join()
# Example usage: crawl the listing that starts at base_url.
base_url = "http://www.imeitou.com/nvsheng/omfns/"
save_directory = "imeitou_downloads"
thread_count = 5  # number of concurrent detail-page threads; adjust as needed
scrape_images(
    base_url=base_url,
    save_dir=save_directory,
    thread_count=thread_count,
)