1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
|
import urllib.request
import os


def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    The request is sent with a desktop-browser ``User-Agent`` header
    (presumably because the target site rejects urllib's default one --
    TODO confirm).

    Args:
        url: Address to request; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The undecoded response body.
    """
    req = urllib.request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    )
    # Use the response as a context manager so the underlying connection is
    # closed even if read() raises.
    with urllib.request.urlopen(req) as response:
        return response.read()


def get_page(url):
    """Return the current comment-page number found in the page at *url*.

    The page is decoded as UTF-8 and searched for the text
    ``current-comment-page``; the value is the text between the position
    23 characters after the start of that marker and the next ``]``.

    Args:
        url: Address of the page to inspect.

    Returns:
        The page number as a string (the caller converts it to ``int``).

    Raises:
        ValueError: If the marker or the closing ``]`` is not present.
    """
    html = url_open(url).decode('utf-8')

    marker = html.find('current-comment-page')
    if marker == -1:
        # Without this check find() returning -1 would silently produce a
        # meaningless slice starting at index 22.
        raise ValueError("'current-comment-page' marker not found in page")

    # Offset 23 skips the 20-character marker plus three more characters,
    # landing exactly on the first digit of the page number.
    start = marker + 23
    end = html.find(']', start)  # search starts at the page-number position
    if end == -1:
        raise ValueError("closing ']' after the page number not found")

    return html[start:end]
def find_imgs(url):
    """Return the ``.jpg`` image addresses referenced in the page at *url*.

    The page is decoded as UTF-8 and scanned for every occurrence of
    ``img src=``. An occurrence contributes an address only when ``.jpg``
    appears within the 255 characters following its start.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of address strings, in the order they appear in the page.
    """
    html = url_open(url).decode('utf-8')
    img_addrs = []

    a = html.find('img src=')
    while a != -1:
        # Bound the search so a non-jpg tag does not pick up the '.jpg' of
        # some later, unrelated tag.
        b = html.find('.jpg', a, a + 255)
        if b != -1:
            # Image link: skip the 8 characters of 'img src=' plus one more
            # (presumably the opening quote) and keep the 4-character '.jpg'.
            img_addrs.append(html[a + 9:b + 4])
        else:
            # No jpg in this tag; resume the scan just past it.
            b = a + 9
        a = html.find('img src=', b)

    return img_addrs


def save_imgs(folder, img_addrs):
    """Download each address in *img_addrs* into the current directory.

    Every file is named after the last ``/``-separated component of its
    address.

    Args:
        folder: Not used. Files are written relative to the current working
            directory, so the caller is expected to have changed into the
            target folder already.
        img_addrs: Iterable of image addresses. Protocol-relative addresses
            (starting with ``//``) get ``http:`` prepended; any other
            address is requested as given.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        # The scraped addresses are protocol-relative, so a scheme has to be
        # added; addresses that already carry one must not get a second.
        img_url = 'http:' + each if each.startswith('//') else each
        # Download before opening the file so a failed request does not
        # leave an empty file behind.
        img = url_open(img_url)
        with open(filename, 'wb') as f:
            f.write(img)
def download_mm(folder='ooxx', page=10):
    """Download the images from the newest *page* comment pages.

    Creates *folder* if needed, makes it the working directory, reads the
    newest page number from the index page, then walks backwards one page
    at a time, saving every image found on each page.

    Args:
        folder: Directory that receives the images. The process's working
            directory is changed to it and is not restored.
        page: Number of consecutive pages to download, starting from the
            newest one.
    """
    # exist_ok lets the script be re-run without failing on the folder that
    # a previous run created.
    os.makedirs(folder, exist_ok=True)  # create the folder
    os.chdir(folder)  # change the working directory

    url = 'http://jandan.net/ooxx/'
    newest = int(get_page(url))  # page number of the newest page

    for offset in range(page):
        # Derive each page from the fixed newest number. Subtracting the
        # loop index from a running value would skip pages
        # (N, N-1, N-3, N-6, ...).
        page_num = newest - offset
        if page_num < 1:
            # Presumably pages are numbered from 1; nothing older to fetch.
            break
        page_url = url + 'page-' + str(page_num) + '#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)
# Script entry point: run the downloader with its default arguments when the
# file is executed directly rather than imported.
if __name__ == '__main__':
    download_mm()
|
近期评论