# NOTE: a line-number gutter (1-85) and a "|" separator left over from the web page this script was copied from were removed here.
import os

import requests
from pyquery import PyQuery as pq

from config import *
def get_index_page(url):
    """Fetch the index page of a book.

    Args:
        url: Address of the index page.

    Returns:
        The response body as text when the server answers with HTTP 200;
        None for any other status code or when the connection fails.
    """
    # Certificate verification is switched off below, so silence the warning
    # urllib3 would otherwise print for every request.
    requests.packages.urllib3.disable_warnings()
    try:
        response = requests.get(url, verify=False)
    except (requests.exceptions.ConnectionError, ConnectionError) as e:
        # requests raises its own ConnectionError, which does not derive from
        # the builtin of the same name, so both have to be listed.
        print(e)
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_index_page(html, skip=12):
    """Yield the chapter entries listed on a book's index page.

    Args:
        html: Markup of the index page.
        skip: Number of leading ``#list dl dd`` entries to ignore. Defaults
            to 12, the value that used to be hard-coded.

    Yields:
        ``(link, title)`` tuples, where ``link`` is the ``href`` of the
        entry's ``<a>`` element and ``title`` is the entry's text.
    """
    doc = pq(html)
    for position, item in enumerate(doc('#list dl dd').items()):
        if position < skip:
            continue
        link = item.find('a').attr('href')
        if link is None:
            # An entry without an href cannot be downloaded, and
            # concatenating None below would raise TypeError.
            continue
        title = item.text()
        # Tab-separated progress line (the separator had lost its backslash).
        print(title + '\t' + link)
        yield link, title
def parse_chapter_text(html):
    """Extract the chapter body from a chapter page.

    Args:
        html: Markup of the chapter page, or None when the download failed.

    Returns:
        The text of the first ``#content`` element, or None when ``html`` is
        None or the page has no such element.
    """
    if html is None:
        # get_chapter_page returns None on failure; do not hand that to the
        # HTML parser.
        return None
    doc = pq(html)
    return next((item.text() for item in doc('#content').items()), None)
def get_chapter_page(url):
    """Fetch a single chapter page.

    Args:
        url: Address of the chapter page.

    Returns:
        The response body as text when the server answers with HTTP 200;
        None for any other status code or when the connection fails.
    """
    # Certificate verification is switched off below, so silence the warning
    # urllib3 would otherwise print for every request.
    requests.packages.urllib3.disable_warnings()
    try:
        response = requests.get(url, verify=False)
    except (requests.exceptions.ConnectionError, ConnectionError) as e:
        # requests raises its own ConnectionError, which does not derive from
        # the builtin of the same name, so both have to be listed.
        print(e)
        return None
    if response.status_code == 200:
        return response.text
    return None
def save_chapter(text, chapter_name, numbering):
    """Write one chapter to ``<cwd>/<BOOK_NAME>/<numbering><chapter_name>.txt``.

    The outcome is reported on stdout; write failures are not propagated.

    Args:
        text: Chapter body to write, or None when it could not be extracted.
        chapter_name: Chapter title, used in the file name and the messages.
        numbering: Sequence number prefixed to the file name.
    """
    if text is None:
        # Report the failure up front instead of opening the file, which
        # would leave an empty file behind before the write fails.
        print(chapter_name + '保存失败')
        return

    file_name = str(numbering) + chapter_name + '.txt'
    book_dir = '{0}/{1}'.format(os.getcwd(), BOOK_NAME)
    mkdir(book_dir)
    file_path = '{0}/{1}'.format(book_dir, file_name)
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(text)
    except Exception:
        # Best effort: keep going with the next chapter. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit through.
        print(chapter_name + '保存失败')
    else:
        print('保存' + chapter_name + ' 成功')
def mkdir(path):
    """Create directory ``path`` (and any missing parents) if it is absent.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists
    # first and makes repeated calls harmless.
    os.makedirs(path, exist_ok=True)
def main(book):
    """Download every chapter of a book and save each one to disk.

    Args:
        book: Path fragment appended to START_URL to form the index page
            address.
    """
    index_html = get_index_page(START_URL + book)
    chapters = parse_index_page(index_html)
    # Chapters are numbered from 1 in the order the index page lists them.
    for numbering, (chapter_link, chapter_name) in enumerate(chapters, start=1):
        chapter_html = get_chapter_page(START_URL + chapter_link)
        chapter_text = parse_chapter_text(chapter_html)
        save_chapter(chapter_text, chapter_name, numbering)
# Run the download only when executed as a script, not when imported.
if __name__ == '__main__':
    main(BOOK_CODE)
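# NOTE: trailing residue from the source web page: a "|" separator and the heading "近期评论" ("Recent comments").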