python爬虫爬取某网站数据

爬虫的学习:

#
import re
import requests
import threading
import os
# Request headers: present a desktop-browser User-Agent to the server.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/64.0.3282.119 Safari/537.36'
    ),
}
def main():
    """Ask for a list-page number and download the images linked from it.

    Fetches the article list page for the entered page number, extracts the
    detail-page links with a regular expression, and passes them to
    ``down_img`` on a worker thread, waiting until that thread finishes.
    """
    page = input("请输入抓取的页面编号:(1,2,3,...):")
    page_url = "http://www.xxxx.com/article/list/?page=" + str(page)
    # Fetch the page source, decoding the body as UTF-8.
    response = requests.get(page_url, headers=headers)
    response.encoding = "utf-8"
    html = response.text
    # Collect the link of every entry listed on this page.
    url_title_list = re.findall(
        r'<a href="(.*?)" class="list-group-item random_list">', html)
    # Download on a worker thread; join() blocks until it is done, so an
    # exception raised inside down_img does not propagate to this caller.
    t1 = threading.Thread(target=down_img, args=(url_title_list,))
    t1.start()
    t1.join()
def down_img(datas):
    """Download the GIF images referenced by each detail page.

    Args:
        datas: Iterable of detail-page URLs. Each page is fetched and
            scanned for ``<img ... onerror="this.src='...'">`` tags; the
            URL captured from the ``onerror`` attribute is downloaded.

    Only images whose file name contains ``.gif`` are saved, under
    ``D:/python/imgs/``. After a successful run the user is asked whether
    to continue; answering ``y`` starts ``main()`` again.

    Raises:
        Exception: Any error raised while fetching or saving is printed and
            then re-raised unchanged.
    """
    # Directory the images are written to; created on demand below so the
    # write cannot fail merely because the folder is missing.
    img_dir = r'D:/python/imgs/'
    try:
        for img_url in datas:
            page = requests.get(img_url, headers=headers)
            page.encoding = "utf-8"
            html = page.text
            # Triple-quoted raw string: the pattern itself contains both
            # single and double quotes.
            urls = re.findall(
                r'''<img src=".*?" alt=".*?" onerror="this.src='(.*?)'">''',
                html)
            for url in urls:
                r = requests.get(url, headers=headers)
                r.raise_for_status()
                # File name = everything after the last "/" in the URL.
                img_full_name = url.rsplit("/", 1)[-1]
                if ".gif" not in img_full_name:
                    continue
                os.makedirs(img_dir, exist_ok=True)
                with open(img_dir + img_full_name, "wb") as f:
                    f.write(r.content)
        print("down_img done!!!")
    except Exception as e:
        # Python 3 exceptions have no ``.message``; format the exception
        # itself so the original error is reported instead of being masked.
        print("down_img failed. %s" % e)
        raise
    # Asked outside the try block so that a failure in the next round is
    # not reported a second time as a failure of this call.
    is_continue = input("是否继续下载?(y or n):")
    if is_continue == 'y':
        main()
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()