SDCC2016PPT批量下载 – 微风戏雨

SDCC2016PPT批量下载

2016-11-30 17:32

|

6,029

|

2

|

Python

21 字

|

几秒读完

csdn.py

#coding: utf8
'''
CDSN登录
'''
import re
import requests

# HTTP headers passed as ``headers=`` to the requests calls in this module
# (and re-used by dl_csdn_meeting_pdf.py); only a desktop Chrome
# User-Agent string is set.
head={
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
}

def csdn_login(username, password):
    '''Log in to CSDN and return the requests session used for the login.

    The login page is fetched first so that the hidden ``lt`` and
    ``execution`` form fields can be extracted from it; both are then
    posted back together with the credentials.

    Args:
        username: CSDN account name.
        password: CSDN account password.

    Returns:
        The ``requests`` session that performed the login POST. The
        response to that POST is not inspected, so a returned session does
        not by itself prove that the login succeeded.

    Raises:
        IndexError: if the login page does not contain the ``lt`` /
            ``execution`` hidden fields.
    '''
    url = "https://passport.csdn.net/account/login"
    session = requests.session()
    login_page = session.get(url, headers=head)

    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python 3 versions.
    # re.S lets ".*" span the newlines between the two <input> tags.
    tokens = re.findall(
        r'name="lt" value="(.*?)".*\sname="execution" value="(.*?)"',
        login_page.text,
        re.S,
    )
    if not tokens:
        # Same exception type the original indexing raised, but with a
        # message that explains what is actually wrong.
        raise IndexError(
            "could not find the 'lt'/'execution' login tokens in %s" % url
        )
    lt, execution = tokens[0]

    payload = {
        "username": "%s" % username,
        "password": "%s" % password,
        "lt": lt,
        "execution": execution,
        "_eventId": "submit",
    }
    session.post(url, headers=head, data=payload)
    return session

dl_csdn_meeting_pdf.py

#!/usr/bin/env python
#coding:utf8
#__author__ = "Huang Jinqiang<hzkeung@vip.qq.com"
"""
CSDN会议PPT下载
"""
import os
import re
import sys
import time
import urllib

import csdn

# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# that the implicit str/unicode conversion codec can be set to UTF-8.
# NOTE(review): neither reload() as a builtin nor sys.setdefaultencoding
# exists on Python 3, so this module cannot be imported there as written.
reload(sys)
sys.setdefaultencoding('utf8')

# Local aliases for the login helper and the shared request headers
# defined in csdn.py.
csdn_login=csdn.csdn_login
head = csdn.head

def download(url, s, intodir, delay=5):
    '''Download every speech file listed on a CSDN meeting page.

    The meeting page is scanned for session ids (``sid``); for each one the
    speech list is fetched, a download link is requested per speech, and
    the file is saved into ``intodir`` under its original file name.

    Args:
        url: URL of the meeting detail page.
        s: requests session used for all HTTP calls (normally the one
            returned by ``csdn_login``).
        intodir: target directory; created if it does not exist.
        delay: seconds to sleep before requesting each download link
            (defaults to the original hard-coded 5).

    Raises:
        ValueError: if a speech-list or download-link response is not
            valid JSON.
    '''
    # Imported locally so this function does not rely on a file-level
    # import that the module does not have.
    import json

    # Create the target directory, including any missing parents.
    if not os.path.isdir(intodir):
        os.makedirs(intodir)

    def gethtml(url):
        '''Return the response body text of a GET made with the session.'''
        return s.get("%s" % url, headers=head).text

    html = gethtml(url)
    # Collect the sid numbers embedded in the page source.
    # NOTE(review): the literal 23 presumably is the meeting id from the
    # page URL, which would tie this pattern to one meeting -- confirm.
    sids = re.findall(r"(\d+),23,'http://meet.download.csdn.net/speech", html)

    for sid in sids:
        listing = gethtml("http://download.csdn.net/index.php/meeting/speechlist/?sid=%s" % sid)
        # The responses are parsed as JSON instead of being passed to
        # eval(): eval() would execute arbitrary code sent by the server
        # and fails on JSON's true/false/null. json.loads also decodes
        # "\uXXXX" escapes, so the file name needs no second decoding step.
        for res in json.loads(listing):
            # The file name is server-supplied; keep only its base name so
            # it cannot point outside of intodir.
            of = os.path.basename(res['originfile'])

            # Pause between requests, then ask for the download link.
            time.sleep(delay)
            _msg = gethtml('http://download.csdn.net/index.php/meeting/do_download_speech/%s/%s' % (res['id'],res['mid']))
            pdfurl = json.loads(_msg).get('msg')
            if not pdfurl:
                # No link was returned; report the file as failed and move
                # on instead of crashing on a missing value.
                print("donwload %s status: fail" % of)
                continue
            # Strip any backslashes left in the URL, as the original did.
            pdfurl = pdfurl.replace('\\', '')

            # Fetch the file and write it to disk.
            r = s.get("%s" % pdfurl, headers=head)
            try:
                with open(os.path.join(intodir, of), "wb") as code:
                    print("donwloading %s" % of)
                    code.write(r.content)
                    print("donwload %s status: ok" % of)
            except Exception as e:
                # Best effort: report the failure and continue with the
                # next file.
                print("donwload %s status: fail" % of)
                print(e)

if __name__ == "__main__":
    # Log in with the account name and password, then download the files
    # of the meeting page into the SDCC2016 directory using that session.
    meeting_url = 'http://download.csdn.net/meeting/meeting_detail/23'
    target_dir = 'SDCC2016'
    session = csdn_login('cdsn_user', 'csdn_password')
    download(meeting_url, session, target_dir)

脚本适用于下载csdn会议的pdf文档

评论

路人甲

8年前
2017-5-20 19:29:02

下载的文件都只有几 KB，PDF 文件也打不开。
- Huang Jinqiang
  博主
  路人甲
  
  8年前
  2017-5-22 10:18:37
  
  你看下自己的csdn账号是否可以正常登录使用，我发现要进行手机号码验证，否则无法正常使用

发送评论编辑评论

Markdown

|´・ω・)ノ

ヾ(≧∇≦*)ゝ

(☆ω☆)

（╯‵□′）╯︵┴─┴

￣﹃￣

(/ω＼)

∠( ᐛ 」∠)＿

(๑•̀ㅁ•́ฅ)

→_→

୧(๑•̀⌄•́๑)૭

٩(ˊᗜˋ*)و

(ノ°ο°)ノ

(´இ皿இ｀)

⌇●﹏●⌇

(ฅ´ω`ฅ)

(╯°A°)╯︵○○○

φ(￣∇￣o)

ヾ(´･･｀｡)ノ"

( ง ᵒ̌皿ᵒ̌)ง⁼³₌₃

(ó﹏ò｡)

Σ(っ °Д °;)っ

( ,,´･ω･)ﾉ"(´っω･｀｡)

╮(╯▽╰)╭

o(*////▽////*)q

＞﹏＜

( ๑´•ω•) "(ㆆᴗㆆ)

颜文字

Emoji

小恐龙

花!