My First Web Crawler

Something I hacked together in my spare time.

A small crawler that scrapes a novel from a free fiction site.


# -*- coding: utf-8 -*-
# Python 2 script: requires requests, beautifulsoup4 and html5lib
import requests
from bs4 import BeautifulSoup
import sys

def connectnet(url):
    # fetch a URL; return the response, or '' on any network/HTTP error
    try:
        r = requests.get(url, timeout=1)  # 1-second timeout; raise it for slow sites
        r.raise_for_status()
        return r
    except requests.RequestException:
        print('Failed to connect, bad URL: ' + url)
        return ''
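
The 1-second timeout means a single slow response loses a whole chapter. A small retry wrapper (my own addition, not part of the original script) can absorb transient failures:

import time

def connectnet_retry(url, tries=3):
    # Sketch: call connectnet a few times, pausing briefly, before giving up.
    for _ in range(tries):
        r = connectnet(url)
        if r != '':
            return r
        time.sleep(1)
    return ''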

def geturls(homepage):
    # collect the chapter URLs from the book's table-of-contents page
    r = connectnet(homepage)
    myurl = []
    if r != '':
        soup = BeautifulSoup(r.text, "html5lib")
        mulu = soup.find("ul", class_="list_two list_five")  # the chapter list
        for child in mulu.children:
            data_a = child.encode('utf-8')
            # pull whatever sits between the first pair of quotes (the href)
            start = data_a.find('"')
            end = data_a.find('"', start + 1)
            if data_a.find('第') > 0:  # '第' marks a chapter entry, e.g. "第1章"
                myurl.append(data_a[start + 1:end])
    return myurl
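
Searching the raw bytes for quote characters works, but it breaks as soon as a tag carries a second quoted attribute. A sketch of the same extraction via bs4's own attribute access (my rewrite, assuming each chapter entry is an <a> tag inside that <ul>):

def geturls_bs4(homepage):
    r = connectnet(homepage)
    if r == '':
        return []
    soup = BeautifulSoup(r.text, "html5lib")
    mulu = soup.find("ul", class_="list_two list_five")
    if mulu is None:
        return []
    # keep only links whose text contains '第' (a chapter marker)
    return [a['href'] for a in mulu.find_all('a') if u'第' in a.get_text()]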


def getarticle(article_url, bookname):
    # fetch one chapter page and return its text, or '' on failure
    chapter = ''
    if article_url != '':
        rz = connectnet("http://www.fubi8.com/" + bookname + "/" + article_url)
        if rz == '':
            return chapter
        sz = BeautifulSoup(rz.text, "html5lib")
        article = sz.find("article", class_="mgnt_40")
        for child in article.children:
            sn = child.name
            if type(sn) is unicode and sn != 'center':  # real tags only, skip <center>
                con = child.text.encode('utf-8').strip()
                if con != '' and '儒道至圣天启之门一剑飞仙' not in con:  # drop the site's in-text ad line
                    chapter += con + '\n'
        if len(chapter) < 200:
            # fallback: some pages keep the text in bare strings instead of tags
            chapter = ''
            for ch in article.children:
                tp = type(ch.name)
                if ch.name == 'h1' or tp is not unicode:
                    da = (ch.string or u'').strip()
                    if da != '':
                        chapter += da + '\n'
            chapter = chapter.encode('utf-8')
    return chapter
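
If the per-tag filtering ever proves brittle, bs4 can flatten the whole <article> in one call and the filtering can run on plain lines instead. A sketch (my own variant; the ad string is the same one filtered above):

def getarticle_flat(article):
    # article is the <article class="mgnt_40"> tag already located above
    text = article.get_text('\n', strip=True)
    lines = [ln for ln in text.split('\n')
             if ln != '' and u'儒道至圣天启之门一剑飞仙' not in ln]
    return '\n'.join(lines).encode('utf-8')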


def write_tofile(data, bookname):
    # append to e:/mybook/<bookname>.txt; the e:/mybook/ directory must exist
    with open('e:/mybook/' + bookname + '.txt', 'a') as file_object:
        file_object.write(data)



def console_writeProcessMsg(current_num, sum_num):
    # rewrite the same console line with "current/total" progress
    sys.stdout.write('\r')
    sys.stdout.write(str(current_num) + '/' + str(sum_num))
    sys.stdout.flush()


def download_page(bookname):
    mybook = ''
    article_homepage = "http://www.fubi8.com/" + bookname + "/index.htm"
    current_page = 0
    url_list = geturls(article_homepage)
    sum_pages = len(url_list)
    if sum_pages > 0:
        print(str(sum_pages) + ' chapters in total')
        print('Start downloading ' + bookname)
        for ul in url_list:
            current_page += 1
            ar = getarticle(ul, bookname)
            mybook += ar
            if ar == '':
                print('Failed to fetch chapter ' + str(current_page))
                continue
            console_writeProcessMsg(current_page, sum_pages)
        write_tofile(mybook, bookname)  # write the whole book in one go
        print('\n' + bookname + ': download finished, saved under e:/mybook/')
    else:
        print('Failed to fetch the table of contents!')
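
Because mybook buffers every chapter before the single write at the end, a long novel sits entirely in memory. An incremental variant (my own sketch, reusing the functions above) appends each chapter as soon as it arrives:

def download_page_incremental(bookname):
    url_list = geturls("http://www.fubi8.com/" + bookname + "/index.htm")
    sum_pages = len(url_list)
    for current_page, ul in enumerate(url_list, 1):
        ar = getarticle(ul, bookname)
        if ar == '':
            print('Failed to fetch chapter ' + str(current_page))
            continue
        write_tofile(ar, bookname)  # append immediately instead of buffering
        console_writeProcessMsg(current_page, sum_pages)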


# the book name is the slug used in the site's URL
if __name__ == "__main__":
    download_page('shengxu')
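
To grab a different novel, pass the slug from its table-of-contents URL: a book served at http://www.fubi8.com/<slug>/index.htm downloads via download_page('<slug>'), exactly as 'shengxu' does above.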