Crawling pictures from Microsoft's OfficePlus site

A picture crawler for http://www.officeplus.cn

Used to fetch my desktop background pictures.

Code:



# First attempt (static HTML fetch; superseded by the selenium version below,
# which also handles the site's lazy-loaded placeholder images):
#
# from bs4 import BeautifulSoup
# import urllib.request
#
# def retrieve(url):
#     html = urllib.request.urlopen(url)
#     soup = BeautifulSoup(html, 'html.parser')
#     imgs = soup.findAll('img')
#     for img in imgs:
#         src = img.get('src')
#         print(src)
#
# if __name__ == "__main__":
#     retrieve('http://www.officeplus.cn/List.shtml?cat=IMAGE&tag=19&order=1')

from selenium import webdriver
from bs4 import BeautifulSoup
import urllib.request
import os

# Module-level accumulators: collected image URLs and (currently unused)
# image names.
srcs, names = [], []

def browser(url):
    """Open *url* in Firefox and collect image URLs from '.dlink' elements.

    Appends each discovered URL to the module-level ``srcs`` list.
    Elements still showing the site's loading placeholder keep the real
    image URL in their ``lazy-src`` attribute rather than ``data-href``.

    NOTE: the original def had no name (a syntax error); it is called as
    ``browser`` from the __main__ block, so that name is restored here.
    """
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        for li in driver.find_elements_by_class_name('dlink'):
            href = li.get_attribute('data-href')
            if href is None:
                continue
            if href == 'http://www.officeplus.cn/images/officeplus/loading_1.jpg':
                # Placeholder still showing: the real source is lazy-loaded.
                srcs.append(li.get_attribute('lazy-src'))
            else:
                srcs.append(href)
            # names.append(li.get_attribute('alt'))
    finally:
        # Always release the browser, even if the page load fails.
        driver.quit()

def store():
    """Download every URL in ``srcs`` into a ./pictures directory.

    Creates the directory if needed; files are named by index: 0.jpg, 1.jpg, ...
    """
    new_path = os.path.join(os.getcwd(), u'pictures')
    if not os.path.isdir(new_path):
        os.mkdir(new_path)
    for i, src in enumerate(srcs):
        # os.path.join is portable; the original  new_path + '\' + ...  was a
        # syntax error (the backslash escaped the closing quote).
        dest = os.path.join(new_path, str(i) + '.jpg')
        urllib.request.urlretrieve(src, dest)
        print(i, 'success!', 'NEXT PAGE!')

if __name__ == "__main__":
    # Scrape the listing page for image URLs, then download everything found.
    browser('http://www.officeplus.cn/List.shtml?cat=IMAGE&tag=19&order=1')
    store()
    print(srcs)
    print(len(srcs))
  • Just a record of my crawler journey~