Python B站视频爬虫

非大会员番剧爬虫

基于上一篇的视频爬虫改过来的，不能爬取大会员番剧，需要安装 ffmpeg！

需要填写番剧网址上的番剧编号并选择番剧的编号前缀

在b站反爬更新之前应该有效

1
# -*- coding = utf-8 -*-
2
# @Author:Hycer_Lance
3
# @Time:2022/4/3 20:49
4
# @Software:PyCharm
5

6
import sys
7
import time
8
import requests
9
import re
10
import json
11
import subprocess
12
import os
13
import PySimpleGUI as sg
14

def select_mode():
    """Prompt for the download mode and return how many episodes to fetch.

    Mode ``1`` downloads a single episode; mode ``2`` asks the user for an
    episode count.  An unknown mode, a non-integer count, or a count below 1
    prints a message and terminates the program.

    Returns:
        int: number of episodes to download (always >= 1).

    Raises:
        SystemExit: on an unknown mode or an invalid episode count.
    """
    print('Select download mode')
    print('___1___    ___2___')
    print('Single     Multiple')
    try:
        download_mode = input('Select:>')
        if download_mode == '1':
            return 1
        if download_mode != '2':
            print('No such mode')
            sys.exit(1)
        num = int(input("Enter the number of episodes to download:"))
    except (ValueError, EOFError) as result:
        # ValueError: the count was not an integer; EOFError: stdin closed.
        print(result)
        print("Please retry")
        sys.exit(1)
    if num < 1:
        # A non-positive count would make the download loop silently do nothing.
        print("The number of episodes must be at least 1")
        print("Please retry")
        sys.exit(1)
    return num


# Determine the ID prefix
def determine_prefix():
    """Ask which ID prefix the bangumi number uses and return it.

    The answer is trimmed and lower-cased before validation, so ``EP`` or
    ``ss `` are accepted as well.

    Returns:
        str: ``'ep'`` or ``'ss'``.

    Raises:
        SystemExit: if the answer is neither ``ep`` nor ``ss`` or stdin is
            closed.
    """
    try:
        number_type = input("Select number type (ep/ss):")
    except EOFError as result:
        print(result)
        print("Please retry")
        sys.exit(1)
    flag = number_type.strip().lower()
    if flag not in ('ep', 'ss'):
        print("An unrecognized type")
        sys.exit(1)
    return flag


# Fetch the page source
def get_html_res(flag, av):
    """Download the bangumi play page for ``flag + av`` and return its HTML.

    Uses the module-level ``headers`` dict for the request.

    Args:
        flag: ID prefix, placed directly before ``av`` in the URL.
        av: the rest of the ID, as a string.

    Returns:
        str: the response body decoded as text.

    Raises:
        SystemExit: if the request fails, times out, or returns an HTTP
            error status.
    """
    baseurl = "https://www.bilibili.com/bangumi/play/" + flag + av
    print("The video URL:", baseurl)
    try:
        # Without a timeout requests may block forever on a stalled connection.
        response = requests.get(baseurl, headers=headers, timeout=30)
        # Fail here rather than handing an error page to the regex parsers.
        response.raise_for_status()
    except requests.RequestException as result:
        print(result)
        print("Please retry")
        sys.exit(1)
    return response.text


# Extract the play info
def get_playinfo(html_res):
    """Return the raw ``window.__playinfo__`` JSON text embedded in the page.

    Args:
        html_res: page source as a string.

    Returns:
        str: the text between ``window.__playinfo__=`` and the closing
        ``</script>`` of the first matching script tag.

    Raises:
        SystemExit: if the page contains no such script tag.
    """
    match = re.search(r"<script>window\.__playinfo__=(.*?)</script>", html_res)
    if match is None:
        print("Could not find window.__playinfo__ in the page source")
        print("Please retry")
        sys.exit(1)
    return match.group(1)


# Parse the play info as JSON
def conversion_json(playinfo):
    """Parse the play-info JSON text into Python objects.

    Args:
        playinfo: JSON document as ``str`` (or bytes).

    Returns:
        The decoded JSON value.

    Raises:
        SystemExit: if ``playinfo`` is not valid JSON.
    """
    try:
        return json.loads(playinfo)
    except (TypeError, ValueError) as result:
        # json.JSONDecodeError is a ValueError subclass.
        print(result)
        print("Please retry")
        sys.exit(1)


# Extract the bangumi title
def get_title1(html_res):
    """Extract the bangumi title from the page's ``<title>`` tag and print it.

    The title is the text before the fixed site suffix inside ``<title>``.

    Args:
        html_res: page source as a string.

    Returns:
        str: the title.

    Raises:
        SystemExit: if the expected ``<title>`` pattern is not present.
    """
    match = re.search(
        '<title>(.*?)-番剧-全集-高清正版在线观看-bilibili-哔哩哔哩</title><meta name="description"',
        html_res)
    if match is None:
        print("Could not find the bangumi title in the page source")
        print("Please retry")
        sys.exit(1)
    title1 = match.group(1)
    print("The video title:", title1)
    return title1


# Extract the resource URLs
def get_video_url(json_data):
    """Return the ``baseUrl`` of the first DASH video stream.

    Args:
        json_data: decoded play info; read as
            ``json_data['data']['dash']['video'][0]['baseUrl']``.

    Returns:
        The value stored under that path.

    Raises:
        SystemExit: if the path does not exist in ``json_data``.
    """
    try:
        video_url = json_data['data']['dash']['video'][0]['baseUrl']
    except (KeyError, IndexError, TypeError) as result:
        # A bare KeyError prints only the key name, so add context.
        print("No DASH video stream in the play info:", repr(result))
        print("Please retry")
        sys.exit(1)
    print("Get video URL successfully")
    return video_url


def get_audio_url(json_data):
    """Return the ``baseUrl`` of the first DASH audio stream.

    Args:
        json_data: decoded play info; read as
            ``json_data['data']['dash']['audio'][0]['baseUrl']``.

    Returns:
        The value stored under that path.

    Raises:
        SystemExit: if the path does not exist in ``json_data``.
    """
    try:
        audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
    except (KeyError, IndexError, TypeError) as result:
        # A bare KeyError prints only the key name, so add context.
        print("No DASH audio stream in the play info:", repr(result))
        print("Please retry")
        sys.exit(1)
    print("Get audio URL successfully")
    return audio_url


# Download binary data
def download_data(url):
    """Download ``url`` and return the response body as bytes.

    Uses the module-level ``headers`` dict for the request.  The whole body
    is held in memory.

    Args:
        url: resource address to fetch.

    Returns:
        bytes: the raw response body.

    Raises:
        SystemExit: if the request fails, times out, or returns an HTTP
            error status.
    """
    try:
        # The timeout bounds connect/read waits, not the total transfer time.
        response = requests.get(url=url, headers=headers, timeout=30)
        # An error status would otherwise be saved to disk as if it were media.
        response.raise_for_status()
    except requests.RequestException as result:
        print(result)
        print("Please retry")
        sys.exit(1)
    return response.content


# Save the downloaded data
def save_data(title2, video_data, audio_data):
    """Write the video and audio byte strings to two files.

    The files are named ``<title2>_video.mp4`` and ``<title2>_audio.mp3``.

    Args:
        title2: file-name stem (may include a directory part).
        video_data: bytes written to the video file.
        audio_data: bytes written to the audio file.

    Raises:
        SystemExit: if either file cannot be written.
    """
    try:
        print("Start writing video data...")
        with open(title2 + '_video.mp4', mode='wb') as f:
            f.write(video_data)
        print("Write video data successfully")

        print("Start writing audio data...")
        with open(title2 + '_audio.mp3', mode='wb') as f:
            f.write(audio_data)
        print("Write audio data successfully")
    except (OSError, TypeError) as result:
        # OSError: path or disk problem; TypeError: payload is not bytes-like.
        print(result)
        print("Please retry")
        sys.exit(1)


# Merge audio and video
def synthesize(title2, title1):
    """Merge the downloaded audio and video with ffmpeg, then rename the result.

    Reads ``<title2>_video.mp4`` and ``<title2>_audio.mp3``, writes
    ``<title2>.mp4`` and renames it to ``<title1>.mp4``.  Characters in
    ``title1`` that Windows forbids in file names are replaced with ``_``.

    Args:
        title2: file-name stem of the intermediate files.
        title1: human-readable title used for the final file name.

    Raises:
        SystemExit: if ffmpeg cannot be started, exits with a non-zero
            status, or the rename fails.
    """
    # Merge under the ID-based name so odd characters in the title cannot
    # break the ffmpeg command line; the title is only applied by the rename.
    command = [
        "ffmpeg",
        "-i", f"{title2}_video.mp4",
        "-i", f"{title2}_audio.mp3",
        "-c:v", "copy",
        "-c:a", "aac",
        "-strict", "experimental",
        f"{title2}.mp4",
    ]
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title1)
    print("Start synthesizing audio and video...")
    try:
        # An argument list works on every platform; a single command string
        # without shell=True is treated as the program name on POSIX.
        subprocess.check_call(command)
        os.rename(f'{title2}.mp4', f'{safe_title}.mp4')
    except (OSError, subprocess.CalledProcessError) as result:
        print(result)
        print("Please retry")
        sys.exit(1)
    print("Synthesize audio and video successfully")
    print(f'The video {safe_title} is saved in ' + os.getcwd())


# Delete the separate audio and video files
def delete_res(title2):
    """Remove the intermediate video and audio files from the current directory.

    Best effort: each file is attempted independently, and failures are
    reported without stopping the program.

    Args:
        title2: file-name stem of the intermediate files.
    """
    failed = False
    for suffix in ('_video.mp4', '_audio.mp3'):
        try:
            os.remove(os.path.join(os.getcwd(), f'{title2}{suffix}'))
        except OSError as result:
            print(result)
            failed = True
    if failed:
        print("Failed to delete audio and video files, you can delete them manually")


if __name__ == '__main__':
    # Choose the download mode (how many episodes to fetch)
    num = select_mode()
    # Numeric part of the first episode's ID
    av = input("Input start anime number:").strip()
    if not av.isdecimal():
        # The loop below computes int(av) + 1, so reject non-numeric IDs here
        # instead of crashing with a traceback.
        print("The anime number must consist of digits only")
        print("Please retry")
        sys.exit(1)
    # ID prefix (ep/ss)
    flag = determine_prefix()
    # Request headers: a module-level name read by get_html_res and
    # download_data.  Built once because it does not change between episodes.
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                             "Chrome/100.0.4896.60 Safari/537.36",
               "referer": "https://www.bilibili.com/bangumi"}
    # Download the episodes one after another
    for _ in range(num):
        html_res = get_html_res(flag, av)  # page source as text

        title1 = get_title1(html_res)  # bangumi title via regex
        title2 = flag + av  # stem used to name the downloaded files
        # Next episode ID for multi-episode mode; IDs usually increase by one,
        # but some series are numbered out of order.
        av = str(int(av) + 1)
        playinfo = get_playinfo(html_res)  # play info embedded in the page
        json_data = conversion_json(playinfo)  # play info decoded from JSON
        video_url = get_video_url(json_data)  # video stream address
        audio_url = get_audio_url(json_data)  # audio stream address
        # Download the binary payloads
        print("Starting download video data...")
        video_data = download_data(video_url)
        print("Download video data successfully")
        print('--------------------------------')
        print("Starting download audio data...")
        audio_data = download_data(audio_url)
        print("Download audio data successfully")

        save_data(title2, video_data, audio_data)  # write both files
        synthesize(title2, title1)  # merge audio and video
        delete_res(title2)  # remove the separate audio and video files

B站视频爬虫

B站上的视频都是音视频分离的，电脑上需要安装 ffmpeg 来合成下载下来的音视频。在B站反爬更新之前可以成功爬取视频，不能爬番剧。

1
# -*- coding = utf-8 -*-
2
# @Author:Hycer_Lance
3
# @Time:2022/4/2 16:08
4
# @Software:PyCharm
5
import sys
6
import time
7

8
import requests
9
import re#正则表达式
10
import json
11
import pprint#格式化输出方便查看数据
12
import subprocess
13
import os
14

# Step 1: ask for the BV id, fetch the video page and extract the title.
try:
    bv=input("BV:")
    baseurl="https://www.bilibili.com/video/"+bv
    print("The video URL:", baseurl)
    # NOTE(review): the cookie below is a hard-coded login session copied from
    # a browser.  It is kept byte-identical to preserve behaviour, but it
    # should be loaded from the environment or a config file, not committed.
    headers={"user-agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
             "referer": "https://www.bilibili.com",
             "cookie": "buvid3=37DE417F-8350-C04A-154A-12B7B50B5ADF50216infoc; i-wanna-go-back=-1; b_ut=7; "
                       "_uuid=61023FFA5-163E-75A5-6B39-1010FD7D85B75463413infoc; "
                       "buvid4=AC529F3A-9AA5-3D76-C31D-2BD5C52F3B9A51558-022040218-OuD1af3VG92lX1kdOOSHzg%3D%3D; "
                       "CURRENT_BLACKGAP=0; blackside_state=1; rpdid=|(k|~JkmRJ~Y0J'uYR))mYRYk; innersign=0; "
                       "b_lsid=8A8CEBAD_17FEF73F50C; fingerprint=b5f6c1bb5af1df68d53c77ffdeedfb8c; "
                       "buvid_fp_plain=undefined; SESSDATA=5f313593%2C1664541835%2C6f7a2%2A41; "
                       "bili_jct=ad3a2931c76e1b09b32821a684b482c7; DedeUserID=259424943; "
                       "DedeUserID__ckMd5=0c3254f433bfda4b; sid=at4dozdd; buvid_fp=b5f6c1bb5af1df68d53c77ffdeedfb8c; "
                       "CURRENT_FNVAL=4048; CURRENT_QUALITY=16; PVID=3"}

    # Timeout so a stalled connection cannot hang the script; status check so
    # an error page is not handed to the regexes below.
    page_response=requests.get(baseurl, headers=headers, timeout=30)
    page_response.raise_for_status()
    html_res=page_response.text  # page source

    # First match of the title attribute on the video-title <h1>
    title1=re.findall('<h1 title="(.*?)" class="video-title">', html_res)[0]
    print("The video title:", title1)
    title2=bv
except Exception as result:
    print(result)
    print("Please retry")
    sys.exit(1)

# Step 2: pull the play info JSON out of the page and read the stream URLs.
try:
    playinfo=re.findall("<script>window.__playinfo__=(.*?)</script>", html_res)[0]
    json_data=json.loads(playinfo)
    # pprint.pprint(json_data)

    video_url=json_data['data']['dash']['video'][0]['baseUrl']  # first video stream
    print("Get video URL successfully")
    audio_url=json_data['data']['dash']['audio'][0]['baseUrl']  # first audio stream
    print("Get audio URL successfully")
except Exception as result:
    print(result)
    print("Please retry")
    sys.exit(1)

# Step 3: download both streams into memory.
try:
    print("Start downloading video data...")
    video_response=requests.get(url=video_url, headers=headers, timeout=30)
    video_response.raise_for_status()
    get_video=video_response.content  # raw video bytes
    print("Download video data successfully")
    time.sleep(1.5)

    print("Start downloading audio data...")
    audio_response=requests.get(url=audio_url, headers=headers, timeout=30)
    audio_response.raise_for_status()
    get_audio=audio_response.content  # raw audio bytes
    print("Download audio data successfully")
    time.sleep(1.5)
except Exception as result:
    print(result)
    print("Please retry")
    sys.exit(1)

# Step 4: write the two streams to disk.
try:
    print("Start writing video data...")
    with open(title2+'_video.mp4', mode='wb') as f:
        f.write(get_video)
    print("Write video data successfully")
    time.sleep(1)

    print("Start writing audio data...")
    with open(title2+'_audio.mp3', mode='wb') as f:
        f.write(get_audio)
    print("Write audio data successfully")
    time.sleep(1)
except Exception as result:
    print(result)
    print("Please retry")
    sys.exit(1)

# Step 5: merge audio and video with the system ffmpeg.
try:
    print("Start synthesizing audio and video...")
    # Merge under the BV-id name so odd characters in the title cannot break
    # the ffmpeg command line; the title is only applied by the rename.
    # An argument list works on every platform; a single command string
    # without shell=True is treated as the program name on POSIX.
    subprocess.check_call([
        "ffmpeg",
        "-i", f"{title2}_video.mp4",
        "-i", f"{title2}_audio.mp3",
        "-c:v", "copy",
        "-c:a", "aac",
        "-strict", "experimental",
        f"{title2}.mp4",
    ])
    # Replace characters Windows forbids in file names before renaming.
    safe_title=re.sub(r'[\\/:*?"<>|]', '_', title1)
    os.rename(f'{title2}.mp4', f'{safe_title}.mp4')
    print("Synthesize audio and video successfully")
    print(f'The video {safe_title} is saved in '+os.getcwd())
    time.sleep(2)

except Exception as result:
    print(result)
    print("Please retry")
    sys.exit(1)

# Step 6: best-effort removal of the separate audio and video files.
cleanup_failed=False
for leftover in (f'{title2}_video.mp4', f'{title2}_audio.mp3'):
    try:
        os.remove(os.path.join(os.getcwd(), leftover))
    except OSError as result:
        print(result)
        cleanup_failed=True
if cleanup_failed:
    print("Failed to delete audio and video files, you can delete them manually")