
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
|
import os import urllib print 'Please enter a URL like http://www.py4inf.com/cover.jpg' urlstr = raw_input().strip() img = urllib.urlopen(urlstr) # Get the last "word" words = urlstr.split('/') fname = words[-1] # Don't overwrite the file if os.path.exists(fname) : if raw_input('Replace '+fname+' (Y/n)?') != 'Y' : print 'Data not copied' exit() print 'Replacing',fname fhand = open(fname, 'w') size = 0 while True: info = img.read(100000) if len(info) < 1 : break size = size + len(info) fhand.write(info) print size,'characters copied to',fname fhand.close()
|
http://www.jianshu.com/p/7e86f0505656
爬取某页面在本站的链接
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
|
import re
from collections import deque
from urllib.parse import urljoin

import requests
def get_pages(link):
    """Crawl breadth-first from ``link`` and yield each fetched page URL once.

    Every page is downloaded with ``requests.get`` and scanned for anchors of
    the exact form ``<a href="...">``; an anchor carrying any further
    attribute after ``href`` (for example ``target="_blank"``) does not
    match.  Root-relative targets (those starting with ``/``) are resolved
    against the page they were found on, and every resulting ``http://`` or
    ``https://`` URL that has not been queued before is scheduled for a
    later visit.

    Args:
        link: URL of the page the crawl starts from.

    Yields:
        The URL of each page, after that page has been fetched and its
        links have been queued.  A given URL is yielded at most once.

    Raises:
        Any exception raised by ``requests.get`` propagates unchanged.
    """
    # Compiled once, outside both loops.
    anchor_re = re.compile(r'<a href="([^"]+)">')
    pending = deque([link])
    # URLs already queued; without this, pages that link to each other
    # would be fetched over and over and the crawl would never end.
    seen = {link}
    while pending:
        current_link = pending.popleft()
        page = requests.get(current_link)
        # page.text is the decoded body; str(page.content) would be the
        # repr of a bytes object on Python 3, escapes included.
        for url in anchor_re.findall(page.text):
            if url.startswith('/'):
                # urljoin resolves against the site root; plain string
                # concatenation turned '/about' on 'http://sample.com'
                # into 'http://sample.comabout'.
                url = urljoin(current_link, url)
            if url.startswith(('http://', 'https://')) and url not in seen:
                seen.add(url)
                pending.append(url)
        yield current_link
# Start a crawl at the sample site.  get_pages returns a generator, so no
# request is made until the loop below asks for the first result.
webpage = get_pages('http://sample.com')

# Print the URL of every page as soon as the crawler has processed it.
for result in webpage:
    print(result)
# '<a href="([^"]+)">'
用括号提取匹配内容中的链接,但不包括外部网站链接
([^"]+) 匹配一个或多个非 " 字符
外部链接通常如下:<a href="http://www.ccin.com/contact-us/" target="_blank">(href 之后还带有 target 属性,因此不会被上面的正则匹配)
|
近期评论