1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
|
import sys

import requests
from bs4 import BeautifulSoup
def connectnet(url):
    """Fetch *url* with a GET request.

    Returns the ``requests.Response`` on success, or the empty string ''
    on any network/HTTP failure (callers test for '' as the error value).
    """
    try:
        # 1s was too short and produced spurious failures on slow pages.
        r = requests.get(url, timeout=10)
        # The original `r.status_code` line was a no-op; actually reject
        # non-2xx responses so callers never parse an error page.
        r.raise_for_status()
        return r
    except requests.exceptions.RequestException:
        # Error message kept verbatim: "URL connection failed, bad URL:"
        print('链接网址出错,错误网址为:' + url)
        return ''
def geturls(homepage):
    """Scrape the book index page and return the list of chapter URLs.

    Chapter links are the href values of children of the
    ``<ul class="list_two list_five">`` table of contents whose text
    contains '第' ("chapter N"). Returns [] when the page or the list
    cannot be fetched/found.
    """
    r = connectnet(homepage)
    myurl = []
    if r != '':
        soup = BeautifulSoup(r.text, "html5lib")
        mulu = soup.find("ul", class_="list_two list_five")
        # Guard: site layout change would make find() return None and
        # the original code crash on .children.
        if mulu is None:
            return myurl
        for child in mulu.children:
            # str(child), not child.encode('utf-8'): bytes cannot be
            # searched with str needles under Python 3.
            data_a = str(child)
            start = data_a.find('"')
            end = data_a.find('"', start + 1)
            # Only entries that look like chapter links ("第...章").
            if data_a.find('第') > 0:
                myurl.append(data_a[start + 1:end])
    return myurl
def getarticle(article_url, bookname):
    """Download one chapter page and return its text content as str.

    Returns '' when the URL is empty, the fetch fails, or no usable
    content is found. Primary extraction walks tag children of
    ``<article class="mgnt_40">``; if that yields suspiciously little
    text (< 200 chars), a fallback pass collects bare strings instead.
    """
    chapter = ''
    if article_url != '':
        rz = connectnet("http://www.fubi8.com/" + bookname + "/" + article_url)
        if rz == '':
            return chapter
        sz = BeautifulSoup(rz.text, "html5lib")
        article = sz.find("article", class_="mgnt_40")
        # Guard against a missing article container (layout change).
        if article is None:
            return chapter
        for child in article.children:
            # Tags have a str .name; NavigableStrings have name=None.
            # 'center' blocks hold site chrome, not story text.
            # (isinstance(..., str) replaces the py2-only `unicode` check.)
            if isinstance(child.name, str) and child.name != 'center':
                con = child.text.strip()
                # Drop empty lines and the site's watermark string.
                if con != '' and '儒道至圣天启之门一剑飞仙' not in con:
                    chapter += con + '\n'
        if len(chapter) < 200:
            # Fallback: some chapters keep text in bare strings / <h1>.
            chapter = ''
            for ch in article.children:
                if ch.name == 'h1' or not isinstance(ch.name, str):
                    # ch.string can be None for multi-child tags.
                    da = (ch.string or '').strip()
                    if da != '':
                        chapter += da + '\n'
    # Return str (not UTF-8 bytes as before): write_tofile opens the
    # output in text mode, which only accepts str under Python 3.
    return chapter
def write_tofile(data, bookname, directory='e:/mybook/'):
    """Append *data* to ``<directory><bookname>.txt`` as UTF-8 text.

    *directory* defaults to the original hard-coded 'e:/mybook/' path
    so existing callers are unaffected; it must end with a separator.
    """
    # Explicit UTF-8: the content is Chinese text and the platform
    # default encoding (e.g. cp936/cp1252 on Windows) would corrupt it.
    # Append mode so successive calls extend the same book file.
    with open(directory + bookname + '.txt', 'a', encoding='utf-8') as file_object:
        file_object.write(data)
def console_writeProcessMsg(current_num, sum_num):
    """Overwrite the current console line with 'current/total' progress.

    :param current_num: chapters downloaded so far
    :param sum_num: total number of chapters
    """
    # '\r' (not the literal letter 'r') returns the cursor to the start
    # of the line so the counter updates in place instead of scrolling.
    sys.stdout.write('\r')
    sys.stdout.write(str(current_num) + '/' + str(sum_num))
    # Flush immediately: stdout is line-buffered and '\r' never emits '\n'.
    sys.stdout.flush()
def download_page(bookname):
    """Download every chapter of *bookname* from fubi8.com and save it.

    Fetches the chapter list from the book's index page, downloads each
    chapter, shows a live progress counter, then writes the whole book
    once to e:/mybook/<bookname>.txt.
    """
    mybook = ''
    article_homepage = "http://www.fubi8.com/" + bookname + "/index.htm"
    current_page = 0
    url_list = geturls(article_homepage)
    sum_pages = len(url_list)
    if sum_pages > 0:
        # "总共 N 章节" = "N chapters in total"; "开始下载" = "start download"
        print('总共' + str(sum_pages) + '章节')
        print('开始下载' + bookname)
        for ul in url_list:
            current_page += 1
            ar = getarticle(ul, bookname)
            mybook += ar
            if ar == '':
                # "chapter N download failed"
                print('第' + str(current_page) + '章节获取失败')
                continue
            console_writeProcessMsg(current_page, sum_pages)
        # Write exactly once after the loop: write_tofile appends, so
        # writing the accumulated buffer per-iteration would duplicate
        # every earlier chapter.
        write_tofile(mybook, bookname)
        # '\n' (not literal 'n') ends the '\r' progress line cleanly.
        # "下载完毕 保存在" = "download finished, saved at"
        print('\n' + bookname + ':下载完毕 保存在:e:/mybook/')
    else:
        # "failed to fetch the table of contents"
        print('目录信息获取失败!!!!!')
if __name__ == "__main__":
    # Script entry point: download the book 'shengxu' ("圣墟").
    download_page('shengxu')
|
近期评论