起因是想在樱花动漫上找一部番,发现网页中直接包含视频文件的地址,于是想到用 Python 自动化下载。Python 已经很久没用了,代码整体写得比较粗糙,以下是编写代码的过程。
解析页面,获取动漫的所有集播放链接
这部分源码来源于这篇博客
Python 代码:
import os
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

# Browser-like User-Agent sent with the index-page request.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
}
# Overview page of the show; it lists the links to every episode.
url = "http://www.yhdm.io/show/889.html"
# A timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() stops early instead of parsing an HTTP error page.
r = requests.get(url, headers=header, timeout=30)
r.raise_for_status()
html = r.text
soup = BeautifulSoup(html, "html.parser")

# The episode links are the <a> tags inside <div class="movurl">.
movurl = soup.find("div", class_="movurl")
if movurl is None:
    raise RuntimeError('no <div class="movurl"> found in ' + url)
urls = movurl.find_all("a")

# Site root used to turn the relative hrefs into absolute play-page URLs.
vstr = "http://www.yhdm.io/"
# urljoin handles hrefs with or without a leading "/" (plain concatenation
# would yield "http://www.yhdm.io//..."); anchors without an href are skipped.
videourls = [urljoin(vstr, u.get("href")) for u in urls if u.get("href")]
运行后,变量 videourls 是一个存储了所有剧集播放页链接的列表,为下一步获取具体的 MP4 链接做准备。
获取MP4下载地址链接
以其中一集为例,来获取其MP4下载地址
Python 代码:
import re
import requests
from bs4 import BeautifulSoup
# Play page of a single episode, used here as a worked example.
url = 'http://www.yhdm.io/v/889-1.html'
# A timeout keeps the script from hanging on a stalled connection.
response = requests.get(url, timeout=30)
html = response.text
soup = BeautifulSoup(html, "lxml")

# The MP4 address is searched for inside the element with id="playbox".
textlist = soup.select('#playbox ')
if not textlist:
    # Fail with a clear message instead of a bare IndexError on textlist[0].
    raise RuntimeError('no #playbox element found in ' + url)
text = str(textlist[0])

# Every http(s) URL ending in ".mp4" that appears in the player markup.
matchObj = re.findall(r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\.mp4)', text)
# One URL per line, so each printed line is directly usable as a download
# address even when several matches are found.
v = '\n'.join(matchObj)
print(v)
使用wget库下载视频
Python 代码:
import wget
# Local file the downloaded video is written to.
DATA_PATH = 'E:/a.mp4'
# Direct address of the MP4 file to fetch.
DATA_URL = 'https://1251316161.vod2.myqcloud.com/007a649dvodcq1251316161/68a349375285890806676748252/lzeAP5gEdj0A.mp4'

# Fetch DATA_URL and store it at DATA_PATH.
wget.download(url=DATA_URL, out=DATA_PATH)
整理合成
将以上三部分程序合并为一个完整的脚本:
Python 代码:
import os
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
import re
import wget
from urllib.parse import urljoin

# Browser-like User-Agent sent with every request to the site.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
}

# --- Step 1: collect the play-page URL of every episode -------------------
# Overview page of the show; it lists the links to every episode.
url = "http://www.yhdm.io/show/889.html"
# A timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() stops early instead of parsing an HTTP error page.
r = requests.get(url, headers=header, timeout=30)
r.raise_for_status()
html = r.text
soup = BeautifulSoup(html, "html.parser")

# The episode links are the <a> tags inside <div class="movurl">.
movurl = soup.find("div", class_="movurl")
if movurl is None:
    raise RuntimeError('no <div class="movurl"> found in ' + url)
urls = movurl.find_all("a")

# Site root used to turn the relative hrefs into absolute play-page URLs.
vstr = "http://www.yhdm.io/"
# urljoin handles hrefs with or without a leading "/" (plain concatenation
# would yield "http://www.yhdm.io//..."); anchors without an href are skipped.
videourls = [urljoin(vstr, a.get("href")) for a in urls if a.get("href")]

# --- Step 2: extract the MP4 address from each play page ------------------
# Compiled once, outside the loop; matches any http(s) URL ending in ".mp4".
mp4_pattern = re.compile(r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\.mp4)')

# videos[k] is the MP4 URL of episode k+1, or None when none was found.
# Keeping a placeholder preserves the episode numbering of the output files.
videos = []
for page_url in videourls:
    # Same headers as the index request, for consistency.
    response = requests.get(page_url, headers=header, timeout=30)
    response.encoding = 'utf-8'
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    textlist = soup.select('#playbox')
    # Search the #playbox element only; an absent element means no match.
    matchObj = mp4_pattern.findall(str(textlist[0])) if textlist else []
    if matchObj:
        # Use the first match: joining several matches with '[' (as before)
        # produced a string that is not a valid URL.
        videos.append(matchObj[0])
    else:
        print('no MP4 address found, skipping: ' + page_url)
        videos.append(None)

# --- Step 3: download every episode ----------------------------------------
# Target folder. The previous DATA_PATH + file_name had no path separator, so
# files were written as "F:/无限斯特拉托斯1.mp4" instead of into this folder.
DATA_PATH = 'F:/无限斯特拉托斯'
os.makedirs(DATA_PATH, exist_ok=True)
for j, DATA_URL in enumerate(videos):
    if DATA_URL is None:
        continue
    # Episodes are numbered from 1 in the order they appear on the index page.
    file_name = str(j + 1) + '.mp4'
    wget.download(DATA_URL, out=os.path.join(DATA_PATH, file_name))