Author: Anysun
Please credit the source when reposting, thanks.
I really like the photos on http://guo.lu/, but the packaged downloads still have to be unzipped, and since I happened to be learning Python, I used this to get some practice.
So I wrote a small crawler that automatically pulls down the latest 15 posts.
Code
__author__ = 'anysun'

import os
import re
import urllib
import urllib2


def downloadPic(Posts):
    print 'Starting Download...'
    image = urllib.URLopener()
    PostNumber = 0
    for var in Posts:
        PostNumber += 1
        PicNumber = 0
        content = urllib2.urlopen(var).read()
        # Full-size images sit behind target="_blank" links that wrap <img> tags.
        Pics = re.findall(r'target="_blank" href="(https?://.*?)"><img', content)
        PostDir = os.path.join(os.getcwd(), 'dump', GetFilename(var))
        if not os.path.exists(PostDir):
            os.mkdir(PostDir)
        for i in Pics:
            PicNumber += 1
            Filename = GetFilename(i)
            image.retrieve(i, os.path.join(PostDir, Filename))
            print '[' + str(PostNumber) + '/' + str(len(Posts)) + '] ' \
                + 'Posts:' + GetFilename(var) \
                + ' [' + str(PicNumber) + '/' + str(len(Pics)) + ']' \
                + Filename + '....OK!'


def GetFilename(url):
    # The last path segment of a URL doubles as the file (or directory) name.
    return url.split('/')[-1]


def PostsLink():
    # Scrape the front page for permalinks to individual posts,
    # which end in a digit.
    content = urllib2.urlopen('http://guo.lu').read()
    Posts = re.findall(r'href="(https?://guo\.lu/.*?\d)">', content)
    print 'Ok!'
    return Posts


def Init():
    # Create the dump directory on the first run.
    if not os.path.exists(os.path.join(os.getcwd(), 'dump')):
        os.mkdir(os.path.join(os.getcwd(), 'dump'))
        print 'Init Dump Dir....Ok!'
    else:
        print 'Init....Ok!'
    print 'Reading List....'
    Posts = PostsLink()
    downloadPic(Posts)


def main():
    Init()


if __name__ == '__main__':
    main()
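To try it, save the script (say as crawler.py; the filename is my choice, not from the original post) and run it with Python 2, since it relies on urllib2 and print statements. The post and image names below are placeholders, but the log format comes straight from the print statements above:

    $ python2 crawler.py
    Ok!
    Init Dump Dir....Ok!
    Reading List....
    Starting Download...
    [1/15] Posts:<post-name> [1/8]<image-name>....OK!
    ...

Each post gets its own folder under dump/ in the working directory, named after the last segment of its URL, with all of its images inside.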
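For anyone on Python 3, urllib2 and urllib.URLopener no longer exist, so here is a rough equivalent of the download step using urllib.request. This is a minimal sketch of the same idea, not the author's script; the name download_posts and the utf-8 decoding are my assumptions.

    # Python 3 sketch of the same scrape-and-download loop (not the original script).
    import os
    import re
    import urllib.request

    def download_posts(posts):
        for post_url in posts:
            # Fetch the post page; assume utf-8, replacing any bad bytes.
            html = urllib.request.urlopen(post_url).read().decode('utf-8', 'replace')
            # Same pattern as the Python 2 version: full-size images sit
            # behind target="_blank" links that wrap <img> tags.
            pics = re.findall(r'target="_blank" href="(https?://.*?)"><img', html)
            post_dir = os.path.join('dump', post_url.rstrip('/').split('/')[-1])
            os.makedirs(post_dir, exist_ok=True)
            for pic_url in pics:
                filename = pic_url.split('/')[-1]
                # urlretrieve stands in for urllib.URLopener().retrieve.
                urllib.request.urlretrieve(pic_url, os.path.join(post_dir, filename))
                print('saved', os.path.join(post_dir, filename))

Apart from swapping urlretrieve for URLopener.retrieve and decoding the fetched bytes, the logic is the same two-regex scrape as the original.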