Perl - 获取/下载知乎视频

Perl - 获取/下载知乎视频

示例(这id名字取得 。。。)

哥不穿内裤:你看过哪些和以往认知大相径庭的科普视频?


问:如何知道浏览器获取视频的整个过程?

答:打开火狐的调试工具(按F12),选择[网络],然后每个 get 和 post 都去了解一下


流程:

  • 访问答案页面 -> 获取视频页面编号
  • 获取 oauth 值 (在答案页面中有个名称中含有 main.app 的 js,其中包含 oauth 值)。
  • 获取 m3u8 的链接地址,此时需要用到 oauth (作为 authorization 请求头发送):
# Excerpt: request the video info URL ($main followed by the page code),
# passing the oauth value as the "authorization" header.
my $res = $ua->get( 
                $main .$pgcode, 
                "authorization" => $oauth,
            );

# The response body is decoded as JSON; the playlist/sd/play_url entry is
# the m3u8 URL.
my $data = decode_json( $res->content );
my $play_url = $data->{playlist}->{sd}->{play_url};  # m3u8 url
  • 然后在 m3u8 中包含所有视频的切片地址,逐个下载,合并后保存到单个文件。


完整代码:

=info
    Author: 523066680
    Date: 2018-05
=cut

use Modern::Perl;
use LWP::UserAgent;
use File::Slurp;
use JSON;
STDOUT->autoflush(1);

# Main flow: fetch the answer page, collect the ids of the embedded videos,
# obtain the oauth value and download every video into the work directory.
goto_dir("D:/temp");
our $main = "https://lens.zhihu.com/api/videos/";
our $ua = LWP::UserAgent->new(  );
our $target = "https://www.zhihu.com/question/271736973/answer/391332001";

my $res = $ua->get( $target );
# Without this check an error page would be parsed as if it were the answer.
die "Cannot fetch $target: " . $res->status_line . "\n"
    unless $res->is_success();

my $html = $res->content();
# Every embedded video appears as a link text ending in "video/<digits>".
my @video = $html=~/>https:.*?video\/(\d+)</g;
warn "No video links found in $target\n" unless @video;

my $oauth = get_oauth( $html );
# The video info request sends this value as its authorization header,
# so stop early when it could not be extracted.
die "Cannot extract the oauth value from $target\n"
    if @video and not defined $oauth;

for my $idx ( 0 .. $#video )
{
    printf "Getting video %s - %s\n", $idx, $video[$idx];
    my @vlinks = get_video_links(  $oauth, $video[$idx] );
    get_video( @vlinks );
}

# Fetch the m3u8 playlist of one video and extract the segment links.
#
# Arguments:
#   $oauth  - value sent as the "authorization" request header
#   $pgcode - video page code, appended to $main to form the info URL
#
# Returns the list ( $pgcode, @segment_urls ); the leading $pgcode is what
# get_video uses as the output file name.
#
# Dies with a message when a request fails, when the JSON has no
# playlist/sd/play_url entry, or when the play URL has an unexpected shape.
sub get_video_links
{
    our ($main, $ua);
    my ( $oauth, $pgcode ) = @_;

    my $res = $ua->get(
                $main .$pgcode,
                "authorization" => $oauth,
            );
    die "Video info request for $pgcode failed: " . $res->status_line . "\n"
        unless $res->is_success();

    my $data = decode_json( $res->content );
    my $play_url = $data->{playlist}->{sd}->{play_url};  # m3u8 url
    die "No sd play_url in the video info for $pgcode\n"
        unless defined $play_url;

    # Part of the URL shared by all segments: everything up to and including
    # the first run of 32 word characters.
    my ($pre_url) = $play_url =~ /(.*?\w{32})/
        or die "Unexpected play_url format: $play_url\n";

    $res = $ua->get( $play_url );
    die "Playlist request for $pgcode failed: " . $res->status_line . "\n"
        unless $res->is_success();

    # Split into lines (LF or CRLF) so that consecutive segment lines and a
    # final line without a trailing newline are all picked up; lines starting
    # with "#" are playlist tags, not segments.
    my @vlinks = map  { "$pre_url/$_" }
                 grep { !/^#/ and /\d+\.ts/ }
                 split /\r?\n/, $res->content;

    return $pgcode, @vlinks;
}

# Download all segments of a video and merge them into one "<name>.ts" file.
#
# Arguments:
#   $name  - base name of the output file (".ts" is appended)
#   @links - segment URLs, in playback order
#
# Prints the number of segments still to be fetched after each request.
# Dies when a segment request fails, so that an error body never ends up
# inside the merged video; nothing is written in that case.
sub get_video
{
    our $ua;
    my ( $name, @links ) = @_;

    unless ( @links )
    {
        warn "No segments to download for $name\n";
        return;
    }

    my $buff = "";
    my $remaining = @links;

    for my $link ( @links )
    {
        print --$remaining, " ";
        my $res = $ua->get( $link );
        die "\nSegment request failed ($link): " . $res->status_line . "\n"
            unless $res->is_success();
        $buff .= $res->content();
    }
    print "\n";

    write_file( "${name}.ts", {binmode=>":raw"}, $buff );
}

# Extract the oauth value used for the video info request.
#
# Arguments:
#   $html - content of the answer page
#
# The page references a script whose URL contains "main.app"; that script
# contains a fragment of the form authorization:"oauth <hex>".
#
# Returns the value between the quotes, or undef (with a warning) when the
# script URL is not found, the script cannot be fetched, or it carries no
# such fragment.
sub get_oauth
{
    our ( $ua );
    my $html = shift;
    my $oauth;

    # Quotes and whitespace are excluded so the match stops at the end of
    # the URL instead of running on into later attributes of the same tag.
    my ($js) = $html =~ /(https:[^<>"'\s]+main\.app[^<>"'\s]+js)/;

    if ( not defined $js )
    {
        warn "No main.app script URL found in the page\n";
    }
    else
    {
        my $res = $ua->get( $js );
        if ( $res->is_success() )
        {
            # pattern: authorization:"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"}
            ($oauth) = $res->content =~/authorization:"([^"]{30,})"/;
        }
        else
        {
            warn "Cannot fetch $js: " . $res->status_line . "\n";
        }
    }

    return $oauth;
}

# Make $dir the current working directory, creating it first when needed.
#
# Arguments:
#   $dir - path of the directory
#
# Returns 1. Dies with the system error when the directory cannot be created
# or entered, instead of silently continuing in the wrong directory.
sub goto_dir
{
    my $dir = shift;
    die "goto_dir: no directory given\n"
        unless defined $dir and length $dir;

    # -d rather than -e: an existing plain file with that name is an error,
    # which mkdir then reports.
    unless ( -d $dir )
    {
        mkdir $dir or die "Cannot create directory $dir: $!\n";
    }

    chdir $dir or die "Cannot change to directory $dir: $!\n";
    return 1;
}

__DATA__


2018-10-16 更新:现在简单多了,视频由多个 ts 切片变成了单个 MP4 文件,直接下载即可。

=info
    Author: 523066680
    2018-07 知乎去掉了 oauth 授权方式
    2018-10 从 ts 多文件,变更为 mp4 单文件下载
=cut

use JSON;
use Encode qw/from_to/;
use LWP::UserAgent;
use Mojo::DOM;
use File::Slurp;
STDOUT->autoflush(1);

# Main flow: fetch the answer page, collect the ids of the embedded videos
# and save each one as "<title>Answer_<answer id>_<index>.mp4" in $wdir.
our $wdir = "D:/temp";
our $main = "https://lens.zhihu.com/api/videos/";
our $ua = LWP::UserAgent->new();
our $target = "https://www.zhihu.com/question/271736973/answer/389377346";
#our $target = "https://www.zhihu.com/question/285103979/answer/492401516";
#our $target = "https://www.zhihu.com/question/278030511/answer/452274063";

my $res = $ua->get( $target );
# Without this check an error page would be parsed as if it were the answer.
die "Cannot fetch $target: " . $res->status_line . "\n"
    unless $res->is_success();

my $html = $res->content();
my @video = $html=~/>https:.*?video\/(\d+)</g;  # ids of the embedded videos
my $title = get_title_name( $html );

# The answer id is the trailing number of the URL; use a placeholder when
# the URL does not end in digits so the file name stays well-formed.
my ($answerID) = ($target=~/\/(\d+)$/);
$answerID = "unknown" unless defined $answerID;

unless ( -d $wdir )
{
    mkdir $wdir or die "Cannot create directory $wdir: $!\n";
}
chdir $wdir or die "Cannot change to directory $wdir: $!\n";

for my $idx ( 0 .. $#video )
{
    printf "Getting video %s - %s\n", $idx, $video[$idx];
    get_video( $video[$idx], "${title}Answer_${answerID}_${idx}.mp4" );
}

# Download one video as a single file.
#
# Arguments:
#   $pgcode - video page code, appended to $main to form the info URL
#   $fname  - name of the output file
#
# Dies with a message when the info request fails, when the JSON has no
# playlist/sd/play_url entry, or when the download itself fails.
sub get_video
{
    our ($main, $ua);
    my ( $pgcode, $fname ) = @_;

    my $res = $ua->get( $main .$pgcode );
    die "Video info request for $pgcode failed: " . $res->status_line . "\n"
        unless $res->is_success();

    my $data = decode_json( $res->content );
    my $play_url = $data->{playlist}->{sd}->{play_url};
    die "No sd play_url in the video info for $pgcode\n"
        unless defined $play_url;

    # Let LWP stream the body straight into the file instead of holding the
    # whole video in memory first.
    $res = $ua->get( $play_url, ':content_file' => $fname );

    # LWP reports an error raised while saving the body in the X-Died header.
    my $died = $res->header( 'X-Died' );
    if ( not $res->is_success() or defined $died )
    {
        unlink $fname if defined $died;   # do not keep a truncated file
        die "Download of $play_url failed: "
            . ( defined $died ? $died : $res->status_line ) . "\n";
    }
}

# Build a file-name prefix from the <title> element of the answer page.
#
# Arguments:
#   $html - content of the answer page
#
# Returns the title text without the " - 知乎" suffix, converted from
# utf8 to gbk, or an empty string when the page has no <title> element.
sub get_title_name
{
    my $html = shift;
    my $dom = Mojo::DOM->new($html);

    # at() yields undef when nothing matches; calling text on that would die.
    my $node = $dom->at("title");
    return "" unless defined $node;

    my $title = $node->text;
    $title =~s/ - 知乎//;

    # The caller uses the result inside a file name, so replace the
    # characters Windows does not allow there. This has to happen before
    # the gbk conversion: gbk trail bytes can coincide with ASCII characters
    # such as "\" and "|", whereas utf8 multi-byte sequences never do.
    $title =~ s/[\\\/:*?"<>|]/_/g;

    from_to( $title, "utf8", "gbk" );
    return $title;
}

编辑于 2018-10-16

文章被以下专栏收录