python笔记——初学python拉取图片站爬虫

Author:Anysun
转载请注明出处,谢谢

非常喜欢 http://guo.lu/ 的图片,但是打包下载还得解压,正好在学习python,用它练练手。
写了一个自动拉取最新15篇文章的小爬虫。

[title]

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60

# coding:utf8

__author__ = 'anysun'

import sys
import os
import urllib
import urllib2
import re

#Download Pictures
def (Posts):
print'Starting Download...'
image = urllib.URLopener()
PostNumber = 0
for var in Posts:
PostNumber = PostNumber + 1
PicNumber = 0
content = urllib2.urlopen(var).read()
Pics = re.findall('target="_blank" href="(https?://.*?)"><img',content)
if os.path.exists(os.getcwd()+'/dump/'+GetFilename(var)) == False:
os.mkdir(os.getcwd()+'/dump/'+GetFilename(var))
for i in Pics:
PicNumber = PicNumber + 1
Filename = GetFilename(i)
image.retrieve(i,os.getcwd()+'/dump/'+GetFilename(var)+'/'+Filename)
print '['+bytes(PostNumber)+'/'+bytes(len(Posts))+'] '+'Posts:'+GetFilename(var)+' ['+bytes(PicNumber)+'/'+bytes(len(Pics))+']'+Filename+'....OK!'



#Get Save Filename
def GetFilename(url):
    """Return the last '/'-separated segment of *url*.

    Used both as the per-post directory name and as the saved image
    filename.  A URL ending in '/' yields the empty string.
    """
    return url.split('/')[-1]

def PostsLink():
content = urllib2.urlopen('http://guo.lu').read()
Posts = re.findall('href="(https?://guo.lu/.*d)">',content)
print 'Ok!'
return Posts

def Init():
if os.path.exists(os.getcwd()+'/dump') == False:
os.mkdir(os.getcwd()+'/dump')
print 'Init Dump Dir....Ok!'
else:
print 'Init....Ok!'
print 'Reading List....'
Posts = PostsLink()
downloadPic(Posts)
#main
def main():
    """Script entry point: delegate everything to Init()."""
    Init()


if __name__ == '__main__':
    main()