import os
import urllib
print 'Please enter a URL like http://www.py4inf.com/cover.jpg'
urlstr = raw_input().strip()
img = urllib.urlopen(urlstr)
# Get the last "word"
words = urlstr.split('/')
fname = words[-1]
# Don't overwrite the file
if os.path.exists(fname) :
    if raw_input('Replace '+fname+' (Y/n)?') != 'Y' :
        print 'Data not copied'
        exit()
    print 'Replacing',fname
fhand = open(fname, 'w')
size = 0
while True:
    info = img.read(100000)
    if len(info) < 1 : break
    size = size + len(info)
    fhand.write(info)
print size,'characters copied to',fname
fhand.close()

爬取某页面在本站的链接

import requests
import re

def get_pages(link):
    """Crawl breadth-first starting at *link*, yielding each fetched URL.

    Every page is downloaded with ``requests.get`` and scanned for anchors
    of the exact form ``<a href="...">``. Anchors carrying additional
    attributes (for example ``target="_blank"``) are not matched by that
    pattern. Links beginning with ``/`` are resolved against the page they
    were found on; only links that then start with ``http``/``https`` are
    queued. Each URL is queued at most once, so pages that link to each
    other do not make the crawl loop forever.

    Args:
        link: URL of the page to start crawling from.

    Yields:
        The URL of each page, after it has been fetched and its links
        have been queued.
    """
    from collections import deque
    try:
        from urllib.parse import urljoin
    except ImportError:  # Python 2 keeps urljoin in the urlparse module
        from urlparse import urljoin

    # Compiled once here instead of once per extracted link.
    anchor_re = re.compile(r'<a href="([^"]+)">')
    scheme_re = re.compile(r'https?')

    links_to_visit = deque([link])
    seen = {link}
    while links_to_visit:
        current_link = links_to_visit.popleft()
        page = requests.get(current_link)
        # page.text is the decoded body; str(page.content) on Python 3 would
        # scan the bytes repr ("b'...'" with escape sequences) instead.
        for url in anchor_re.findall(page.text):
            if url.startswith('/'):
                # urljoin resolves against the site root regardless of how
                # deep current_link is or whether it ends with a slash.
                url = urljoin(current_link, url)
            if scheme_re.match(url) and url not in seen:
                seen.add(url)
                links_to_visit.append(url)
        yield current_link


# Start a crawl at the sample site and print each URL the generator yields.
webpage = get_pages('http://sample.com')
for result in webpage:
    print(result)


# '<a href="([^"]+)">'

用括号提取匹配内容中的链接，但不包括外部网站链接

([^"]+) 匹配一个或多个非 " 字符

外部链接通常如下：<a href="http://www.ccin.com/contact-us/" target="_blank">

爬虫代码段爬取某页面在本站的链接

爬取某页面在本站的链接

近期文章

近期评论

标签

热门

文章归档

分类目录

功能