# Crawler example: scraping and downloading clips from an "unmentionable" video site.

import re
import requests
from bs4 import BeautifulSoup


def devio_desc(url):
    """Scrape one listing page and return the videos it advertises.

    The page is fetched through the module-level ``session`` object, which
    this file creates in its ``__main__`` section; that name must exist
    before this function is called.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list of dicts, one per listed video, with the keys
        ``"video_pic"`` (the anchor's ``data-original`` attribute),
        ``"video_href"`` (the anchor's ``href`` prefixed with the site root)
        and ``"title"`` (the anchor's ``title`` attribute).  The list is
        empty when the page has no ``box-video-list`` container.

    Raises:
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Request headers
    headers = {
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9",
        "referer": "https://www.553ca.com/html/1/",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    }
    response = session.get(url=url, headers=headers)
    # Decode the raw bytes as UTF-8 directly.  This yields the same string as
    # the former ``.text.encode('latin1').decode('utf-8')`` round-trip when
    # requests falls back to ISO-8859-1, but does not blow up with
    # UnicodeEncodeError when the server does announce a charset.
    html = response.content.decode('utf-8')
    soup = BeautifulSoup(html, "html.parser")

    content = soup.find('div', class_='box-video-list')
    if content is None:
        # Page layout changed or the request was redirected: nothing to list.
        return []

    data = []
    for item in content.find_all('li', 'col-md-2 col-sm-3 col-xs-4'):
        anchor = item.find('a')
        if anchor is None:
            continue
        href = anchor.get('href')
        if href is None:
            # Without a target there is no detail page to follow later.
            continue
        data.append({
            "video_pic": anchor.get('data-original'),    # thumbnail image
            "video_href": "https://www.553ca.com" + href,  # detail-page link
            "title": anchor.get('title'),                # video title
        })
    return data


# Resolve the real (direct) video link for every scraped entry
# Position of the <script> tag that is searched for the download link.
_DOWNLOAD_SCRIPT_INDEX = 18
# Non-greedy so that a script containing several links yields the first link
# only, instead of one capture spanning everything between the first and the
# last link.
_DOWNLOAD_LINK_RE = re.compile(r' download="(.*?)" target="_blank"', re.S)


def devio_url(data):
    """Resolve the direct download link for every entry in *data*.

    Each entry's ``"video_href"`` page is fetched and the download link is
    extracted from the ``<script>`` tag at index ``_DOWNLOAD_SCRIPT_INDEX``.
    Entries whose page does not contain such a link are reported on stdout
    and skipped rather than aborting the whole batch.

    Args:
        data: Iterable of dicts carrying at least the keys ``"video_href"``
            and ``"title"``.

    Returns:
        A list of dicts with the keys ``"devio_name"`` (the entry's title)
        and ``"devio_url"`` (the extracted link), in input order.

    Raises:
        UnicodeDecodeError: If a fetched page is not valid UTF-8.
    """
    devio_data = []
    # One session for the whole batch (instead of one per entry), closed
    # deterministically when the loop is done.
    with requests.session() as http:
        for entry in data:
            response = http.get(entry["video_href"])
            # Decode the raw bytes as UTF-8; same result as re-encoding
            # ``.text`` through latin1 when requests guessed ISO-8859-1, but
            # safe when the server declares a charset.
            html = response.content.decode('utf-8').strip()
            soup = BeautifulSoup(html, "html.parser")

            scripts = soup.find_all("script")
            match = None
            if len(scripts) > _DOWNLOAD_SCRIPT_INDEX:
                match = _DOWNLOAD_LINK_RE.search(
                    str(scripts[_DOWNLOAD_SCRIPT_INDEX])
                )
            if match is None:
                # Message reads: "<title>: download link not found, skipped".
                print("%s未找到下载链接,已跳过" % entry["title"])
                continue

            devio_data.append({
                "devio_name": entry["title"],
                "devio_url": match.group(1),
            })
    return devio_data


# Download a video to the local disk
def dewnload(devio_data):
    """Stream one video to an ``.mp4`` file in the hard-coded download folder.

    Args:
        devio_data: Dict with the keys ``"devio_url"`` (link to fetch) and
            ``"devio_name"`` (used as the file name and in the status text).

    Returns:
        A status string: the name followed by the "download finished" text
        on success, or by the "download failed" text when the request fails,
        the server answers with an HTTP error status, or the file cannot be
        written.
    """
    name = devio_data["devio_name"]
    try:
        # The response is a context manager so the streamed connection is
        # released even when writing the file fails half-way.
        with requests.get(devio_data["devio_url"], stream=True) as response:
            # Without this an HTTP error page would be saved as the video
            # and reported as a successful download.
            response.raise_for_status()
            with open('D:/爬取视频/%s.mp4' % name, "wb") as mp4:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        mp4.write(chunk)
    except (requests.RequestException, OSError):
        # Network/HTTP failures and file-system errors only; programming
        # errors and KeyboardInterrupt are no longer swallowed.
        return "%s下载失败" % name
    return "%s下载完成" % name

if __name__ == '__main__':
    # Listing page to scrape.
    url = "https://www.553ca.com/html/65/"
    # Kept as a module-level name: the listing-page scraper called below
    # reads ``session`` as a global.
    session = requests.session()

    # Collect the entries advertised on the listing page.
    data = devio_desc(url)
    # Resolve the real video link for each entry (title, link).
    devio_data = devio_url(data)
    # Download every video to the local disk and report each outcome.
    for item in devio_data:
        print(dewnload(item))