python_spider_17knovel

This is a demo of a web spider for the 17k novel site (https://www.17k.com).

"""
    练习;
    17k 小说网
        要求:
            书架上的 n 本书 ;
            按照章节进行爬取并存储到单独的文件夹中;我们就爬取前15章内容

    -----> 优化方向:  数据获取应该做成一个迭代器,文本内容过大的时候,应该按需索取
"""
import os
import time

import requests
from lxml import etree

book_data = []  # {"bookName": ..., "bookId": ...} entries collected from the bookshelf

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
    "referer": "https://user.17k.com/www/bookshelf/",
    # Could not find the real login endpoint -----> making do with a cookie captured from the browser
    # URL encoding ----> special characters are escaped as % followed by hex digits
    "cookie":"GUID=a28859ab-9982-4c22-bbf2-9adab3632f67; c_channel=0; c_csc=web; Hm_lvt_9793f42b498361373512340937deb2a0=1723358828,1723364404,1723366993,1723449913; HMACCOUNT=C8D3DE63DD2135E9; acw_tc=1a0c638417234526513274433e0096275e8bc883e58405690be4294f8ec677; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22a28859ab-9982-4c22-bbf2-9adab3632f67%22%2C%22%24device_id%22%3A%221913c804c2e589-0b20c44b5173c3-26001e51-3686400-1913c804c2fae5%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%2C%22first_id%22%3A%22a28859ab-9982-4c22-bbf2-9adab3632f67%22%7D; Hm_lpvt_9793f42b498361373512340937deb2a0=1723452655; acw_sc__v2=66b9ccf3971a4936cd99f9bb6be954ddd33790ac; accessToken=avatarUrl%3Dhttps%253A%252F%252Fcdn.static.17k.com%252Fuser%252Favatar%252F15%252F95%252F54%252F103705495.jpg-88x88%253Fv%253D1723364559000%26id%3D103705495%26nickname%3DAkabane71%26e%3D1739004659%26s%3D3b5857d00c5e80e6; ssxmod_itna=iqGxci0=eYqeq7KGHqvnp70hhxj20DRxY5oqbXdD/CQmDnqD=GFDK40ooSxDCYm0Ax+zoFh74qqzYn8ihwe8tGETKLKDEyPoDU4i8DCq4wdfDeetD5xGoDPxDeDACKiTDY4Dd06s5tDEDYQDYqGRDB60mqDffqGWGT=DmdiDGTIXD7QDIkt4DDNEmRxNn4RM/bR4ueDSGBt5CDtDjkTD/RDkoDU3X=H49MUHwc2aWM0qxBQD7LiBDYE0ReDHILN5q8Ga77DoD+h6QAprShm4G0wUhGDH+1GPQowjSYxHB4YlxD357H4aQiDD; ssxmod_itna2=iqGxci0=eYqeq7KGHqvnp70hhxj20DRxY5oqbNG9WylDBLeD5GaGYfdbGoAx82x=5XSxmKQwKhilHiOQa/R7XxiqR0f6jtYm1O2Gl8fPQ6eO2FU8gnwPkj62cww5lQMjuhvZZa/QE8dA7iYqRfLm4RxjI+INKfvLz3b8IWYhtdAzFbrRAoR7/n2iymWhQPd=NaajYenzDccaP1YFsr98Ecnh01ykcaazc6yzq/Hih77pw1gTTIOweKOpw/px8zAv1zjjg/P6MKnLv954qVn2H=AREEf87sYb/h0pbsYqKLdlLzsu6FyLxXHBx1r0fPiP=iXOiV6NVG0T2i5E4DQF4UrQSxqiGi7GlU7HC0aBB+fxof4Heb3h05CmQmxvWexm5doBPuAiYx/7i4b0rYiQugtMEObDDFqD+cPGF+8xE8rDxD==; tfstk=f52S9aw0IUY7_drj-bIqGgKM-ckQFz6NFHiLjkpyvYH8vvZLx2urT2PjHcUo98kz4rsQkPqrTYeQipZgx8SoLgDuqvDd7NWZduqoKnYvtx2SHtnaDDLJegrh6bkd7NW2jSbwfvFeDP04zo3mA0K-wyKAcDinpLhpeILxoDHKJy3KktnsYDdKvvhHAEiFNDE5DOwXUxIIIu3XCo2m2B3dIqJdd-i7cVE8XppLh0G7LK2QRPhz9lcgUuWBnYr_Mxn7JOQ-HlFbbbw55LunO-a7OzSetmNQvRDoCHB8cYgS68GkxL4TckFZG-SOIYM-yWkuYh_zc8azq-ZevIHSUSG_FABMy2P4fJiQKwXb5uFTJWsPLdu6VTABcfvKcVSfcBAExXXHlRE9rcG-m0BNciTXtbnmcaIfcBlSwmm4wisXPWf.."
}
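
# A quick illustration of the URL-encoding note above (standard library only, not part
# of the crawl itself): percent-escapes such as "%3D" and "%2F" decode back to "=" and "/".
# The sample string is a fragment of the accessToken cookie.
from urllib.parse import unquote
decoded_sample = unquote("avatarUrl%3Dhttps%253A%252F%252Fcdn.static.17k.com")
# -> "avatarUrl=https%3A%2F%2Fcdn.static.17k.com" (one layer of %-escapes removed)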

"""
    网站的cookies 是动态变化的, 所以要先模拟登录获取响应的 cookies
"""
session = requests.Session()
# Simulate the login request so the session receives the matching cookies
session.get(
    url='https://api.17k.com/pv/log.php?Platform=Web&Guid=a28859ab-9982-4c22-bbf2-9adab3632f67&Uid=103705495&Nickname=Akabane71&cpsSource=0&Channel=web&callback=Q_jsonp_826718',
    headers=header)
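
# requests.Session keeps any Set-Cookie values returned by the request above and
# replays them automatically on every later request, which is how the dynamic cookies
# mentioned in the docstring are picked up. The print is only a sanity check.
print("cookies captured by the session:", session.cookies.get_dict())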

# Fetch the bookshelf data (returned as JSON)
def get_json_data():
    rep = session.get('https://user.17k.com/ck/author2/shelf?page=1&appKey=2406394919', headers=header)

    json_data = rep.json()
    items = json_data.get("data")
    print(len(items))
    for item in items:
        book_name = item.get("bookName")
        book_id = item.get("bookId")
        data = {
            "bookName": book_name,
            "bookId": book_id,
        }
        book_data.append(data)
        break  # only the first book on the shelf is collected in this demo


# Use each bookId to visit the corresponding chapter-list page
def use_booId_get_html():
    for data in book_data:
        book_id = data["bookId"]
        url = f"https://www.17k.com/list/{book_id}.html"
        rep = session.get(url, headers=header)
        tree = etree.HTML(rep.text)
        get_chapter_data(tree)
        time.sleep(2)  # pause between requests
        break  # only the first book on the shelf is handled in this demo


book_chapter_data = []  # chapter URLs collected from the chapter-list page
# Parse the URL of every chapter on the book's chapter-list page
def get_chapter_data(tree):
    # keep only the first 15 chapter links, per the requirement in the module docstring
    a_items = tree.xpath("/html/body/div[5]/dl[1]/dd//a[position()<16]")
    for a_item in a_items:
        href = "https://www.17k.com" + a_item.xpath("@href")[0]
        book_chapter_data.append(href)
        print(href)



download_data = []  # one {"title": ..., "text": ...} dict per chapter
# Parse the title and body text of each chapter page
def get_chapter_title_text():
    for href in book_chapter_data:
        # wait 3 s before each chapter request to avoid hammering the site
        time.sleep(3)
        res = session.get(href, headers=header)
        res.encoding = "utf-8"
        tree = etree.HTML(res.text)
        title = tree.xpath('//*[@id="readArea"]/div[1]/h1/text()')[0]
        print(title)
        p_items = tree.xpath('//*[@id="readArea"]/div[1]/div[2]//p')
        # reset the accumulator, then collect the chapter text paragraph by paragraph
        total_data = ''
        for p_item in p_items:
            total_data += "".join(p_item.xpath('text()')) + "\n"
        # if the text comes back scrambled, the site's JS obfuscation would need to be reverse-engineered
        data = {
            "title": title,
            "text": total_data
        }
        download_data.append(data)
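

# --- Sketch for the "fetch on demand" note in the module docstring ---
# (an assumption about how the optimization could look, not wired into main):
# wrapping the chapter fetch in a generator downloads and parses one chapter per
# iteration instead of buffering everything in download_data first.
def iter_chapter_pages(chapter_urls, delay=3):
    """Yield (title, text) one chapter at a time; nothing is held in memory up front."""
    for href in chapter_urls:
        time.sleep(delay)
        res = session.get(href, headers=header)
        res.encoding = "utf-8"
        tree = etree.HTML(res.text)
        title = tree.xpath('//*[@id="readArea"]/div[1]/h1/text()')[0]
        paragraphs = tree.xpath('//*[@id="readArea"]/div[1]/div[2]//p/text()')
        yield title, "\n".join(paragraphs)
# Usage would look like: for title, text in iter_chapter_pages(book_chapter_data): ...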


# Persist each chapter to its own text file
def save_book_data():
    os.makedirs("./book", exist_ok=True)  # make sure the output folder exists
    for i, data in enumerate(download_data, start=1):
        title = data["title"]
        text = data["text"]
        with open(f"./book/{i} {title}.txt", "w", encoding="utf8") as f:
            f.write(f"{title}\n")
            for line in text.split("\n"):
                f.write(f"{line}\n")



if __name__ == '__main__':
    # 1. Fetch the bookshelf (favorites) data
    get_json_data()
    # 2. Resolve each book's chapter-list page
    use_booId_get_html()
    # 3. Visit each chapter page and extract the novel text
    get_chapter_title_text()
    # 4. Persist the chapters to disk
    save_book_data()