抓取小说|flask|六狼博客|技术博客|技术论坛|六狼网络|六狼科技|六狼IT|六狼星球

2025年3月30日

思路：

解析小说HTML结构，抓取主要元素（例如：小说目录元素及小说内容元素）
抓取小说目录及链接，将链接合并生成具体章节的链接（为避免对网站造成压力，仅爬取前几章）
运行代码，抓取上述所选章节的小说内容

# 导入requests库，用于发送HTTP请求
import requests
# 导入BeautifulSoup类，用于解析HTML和XML文档
from bs4 import BeautifulSoup

# Scrape the first chapters of a novel: read the table-of-contents page,
# collect chapter links up to and including a fixed stop chapter, then fetch
# each chapter page and print its title and text.

# URL of the novel's table-of-contents page.
url = 'https://www.biqu04.cc/book/46176/'
# Request headers that present the script as a desktop browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
# Site root that the site-relative chapter hrefs are appended to.
base_url = 'https://www.biqu04.cc'
# href of the last chapter to collect; collection stops after this link so
# only the first few chapters are requested.
stop_href = '/book/46176/20.html'
# Seconds to wait on each request. Without a timeout, requests would block
# indefinitely on a server that never answers.
request_timeout = 10

try:
    # Fetch the table-of-contents page; raise on HTTP 4xx/5xx responses.
    response = requests.get(url, headers=headers, timeout=request_timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    # The chapter list is looked up in the div with class 'listmain'.
    listmain_div = soup.find('div', class_='listmain')

    # Collected chapter links, each as {'text': <link text>, 'href': <href>}.
    links = []
    if listmain_div:
        for a_tag in listmain_div.find_all('a'):
            href = a_tag.get('href')
            # Anchors without an href cannot be turned into a chapter URL.
            if not href:
                continue
            links.append({
                'text': a_tag.get_text(strip=True),
                'href': href
            })
            # The stop chapter itself is included, then collection ends.
            if href == stop_href:
                break

    for link in links:
        full_url = f"{base_url}{link['href']}"
        try:
            chapter_response = requests.get(
                full_url, headers=headers, timeout=request_timeout
            )
            chapter_response.raise_for_status()
            chapter_soup = BeautifulSoup(chapter_response.text, 'html.parser')
            # The chapter body is looked up in the div whose class attribute
            # is 'Readarea ReadAjax_content'.
            content_div = chapter_soup.find('div', class_='Readarea ReadAjax_content')
            if content_div:
                print(f"\n=== 章节：{link['text']} ===")
                print(content_div.get_text(strip=True))
        except Exception as e:
            # Best effort: a failing chapter is reported and the loop moves
            # on to the next one instead of aborting the whole run.
            print(f"抓取章节失败：{link['text']}，错误：{e}")

except requests.exceptions.RequestException as e:
    # Network or HTTP failure while fetching the table-of-contents page.
    print(f"请求失败: {e}")
except Exception as e:
    # Any other failure outside the per-chapter handler.
    print(f"发生错误: {e}")

一	二	三	四	五	六	日
« 8月
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30