Python 爬取网易云音乐、黑客派网站

Updated in 人生苦短,我用Python with 81 views and 1 comment

虽然没什么用,但还在学习阶段,拿来练手。

爬取黑客派排行榜

import requests
import re
# Fetch the hacpai "general" leaderboard page; the timeout keeps a stalled
# connection from blocking the script forever.
html = requests.get('https://hacpai.com/top/general', timeout=10).text
# Every match is a (name, url) pair. re.S lets ".*?" run across line breaks
# in the HTML.
result = re.findall(r'class="fn-flex-1".*?aria-name="(.*?)".*?href="(.*?)"', html, flags=re.S)
# Open the output file once (append mode) instead of once per entry.
with open('paihangbang.txt', 'a', encoding='utf-8') as f:
    for name, url in result:
        # Slashes in the name are replaced by spaces before joining with the URL.
        line = ' '.join((name.replace("/", " "), url))
        print(line)
        # One entry per line; without the newline all entries run together.
        f.write(line + '\n')

效果如下:
image.png

爬取豆瓣电影排行榜

import requests

import re
# Douban movie chart
chart_html = requests.get('https://movie.douban.com/chart').text
# Pre-compiling pays off when a pattern is reused many times; here it makes
# little difference.
chart_pattern = re.compile(
    'class="pl2".*?<.*?="(.*?)".*?>(.*?)<span.*?>(.*?)</span>.*?"rating_nums">(.*?)</span>.*?"pl">(.*?)</span>',
    re.S,
)
# Each match carries five captured groups, printed on one line per movie.
for url, name1, name2, nums, pl in chart_pattern.findall(chart_html):
    primary_title = name1.replace("/", "").strip()
    secondary_title = name2.replace("/", "").strip()
    print(url, primary_title, secondary_title, nums, pl)

Python 带 cookie 登录黑客派

import requests
from fake_useragent import UserAgent

# Headers sent with the login request: a Referer pointing at the site itself
# and a desktop Chrome User-Agent string.
headers = {
    'Referer': 'https://hacpai.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
}

# JSON payload for the login endpoint.
# NOTE(review): credentials are hard-coded; the password value looks like a
# hash rather than plain text -- confirm, and avoid publishing real values.
data = {
    "nameOrEmail":"cuijianzhe",
    "userPassword":"54949cdcbe4d252ba3400897883df589811",
    "captcha":""
}

# A session stores the cookies the server sets, so requests made through it
# afterwards carry the logged-in state.
request = requests.session()

base_url = "https://hacpai.com/login"
# Post through the session rather than the bare `requests` module; otherwise
# the login cookies are thrown away with the one-off connection.
res = request.post(base_url, headers=headers, json=data, timeout=10)
res.raise_for_status()

print(res.text)

二、 爬取网易云音乐

  • 第一步:先把网络请求 get 下来
#http://music.163.com/song/media/outer/url?id=1377519494  #对外开放的下载接口

import requests
import urllib.request  #进行网络数据下载到本地
from fake_useragent import UserAgent
import re

def getResponse(url,headers):
    """Send a GET request and return the response on success.

    :param url: address to request.
    :param headers: dict of HTTP headers sent with the request.
    :return: the ``requests`` response when the status code is 200;
        ``None`` for any other status code or when the request itself fails.
    """
    try:
        # The timeout keeps a stalled connection from blocking forever.
        response = requests.get(url=url, headers=headers, timeout=10)
    except requests.RequestException:
        # Only request-level failures map to None; programming errors and
        # KeyboardInterrupt are left to propagate.
        return None
    if response.status_code == 200:
        return response
    return None

if __name__ == '__main__':
    url = 'https://music.163.com/song?id=1377519494'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    res = getResponse(url,headers)
    # getResponse returns None on failure, so check before reading .text.
    if res is None:
        print('请求失败:' + url)
    else:
        print(res.text)

歌曲名信息:
image.png

  • 第二步:从页面标题中提取歌曲名称
……
def getSongName(songid):
    """Return the display name of the song with id *songid*.

    The song page is fetched and the text before the first ``-`` of its
    ``<title>`` is taken as the name.

    :param songid: song id, interpolated into the song page URL.
    :return: the stripped name; ``str(songid)`` when the page cannot be
        fetched, has no ``<title>``, or yields an empty name.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    url = 'https://music.163.com/song?id={}'.format(songid)
    response = getResponse(url,headers=headers)
    if response is None:
        # getResponse signals failure with None; fall back to the id.
        return str(songid)
    title = re.search('<title>(.*?)</title>', response.text, re.S)
    if title is None:
        return str(songid)
    name = title.group(1).split('-')[0].strip()
    return name or str(songid)
……

if __name__ == '__main__':
    # Resolve one known song id and show the name that comes back.
    demo_song_id = 1377519494
    print(getSongName(demo_song_id))

获取歌曲名称:
image.png

  • 第三步:下载单曲
……
if __name__ == '__main__':
    # strip() drops stray whitespace typed around the id.
    songid = input('请输入要下载的歌曲id:').strip()
    url = 'http://music.163.com/song/media/outer/url?id={}.mp3'.format(songid)
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    response = getResponse(url,headers)
    if response is None:
        # getResponse returns None on failure; there is nothing to download.
        print('获取下载地址失败:' + url)
    else:
        # response.url is the final address after any redirects were followed.
        down_url = response.url
        SongName = getSongName(songid)
        urllib.request.urlretrieve(down_url,SongName+'.mp3')
……

下载成功:
image.png

  • 完整代码:
import requests
import urllib.request  #进行网络数据下载到本地
from fake_useragent import UserAgent
import re
def getResponse(url,headers):
    """Send a GET request and return the response on success.

    :param url: address to request.
    :param headers: dict of HTTP headers sent with the request.
    :return: the ``requests`` response when the status code is 200;
        ``None`` for any other status code or when the request itself fails.
    """
    try:
        # The timeout keeps a stalled connection from blocking forever.
        response = requests.get(url=url, headers=headers, timeout=10)
    except requests.RequestException:
        # Only request-level failures map to None; programming errors and
        # KeyboardInterrupt are left to propagate.
        return None
    if response.status_code == 200:
        return response
    return None

def getSongName(songid):
    """Return the display name of the song with id *songid*.

    The song page is fetched and the text before the first ``-`` of its
    ``<title>`` is taken as the name.

    :param songid: song id, interpolated into the song page URL.
    :return: the stripped name; ``str(songid)`` when the page cannot be
        fetched, has no ``<title>``, or yields an empty name.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    url = 'https://music.163.com/song?id={}'.format(songid)
    response = getResponse(url,headers=headers)
    if response is None:
        # getResponse signals failure with None; fall back to the id.
        return str(songid)
    title = re.search('<title>(.*?)</title>', response.text, re.S)
    if title is None:
        return str(songid)
    name = title.group(1).split('-')[0].strip()
    return name or str(songid)

if __name__ == '__main__':
    # strip() drops stray whitespace typed around the id.
    songid = input('请输入要下载的歌曲id:').strip()
    url = 'http://music.163.com/song/media/outer/url?id={}.mp3'.format(songid)
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    response = getResponse(url,headers)
    if response is None:
        # getResponse returns None on failure; there is nothing to download.
        print('获取下载地址失败:' + url)
    else:
        # response.url is the final address after any redirects were followed.
        down_url = response.url
        SongName = getSongName(songid)
        # Save the remote song to a local file named after the song.
        urllib.request.urlretrieve(down_url,SongName+'.mp3')

爬取热门歌单并批量下载

import requests
import re
from multiprocessing import Pool

# Request headers used by the functions below: a desktop Chrome User-Agent.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/75.0.3770.142 Safari/537.36'
    ),
}
# Download the playlist listing page and hand each playlist to the workers.
def get_page(url):
    """Find the playlists on a listing page and download each of them.

    :param url: address of the playlist listing page.
    """
    res = requests.get(url, headers=headers, timeout=10)
    # Each match is a (title, playlist_id) pair. The raw string keeps \? and
    # \d as regex escapes without invalid-escape warnings.
    data = re.findall(r'<a title="(.*?)" href="/playlist\?id=(\d+)" .*?</a>', res.text)
    print(data)
    # map() blocks until every playlist is processed; leaving the with-block
    # then shuts the four worker processes down instead of leaking them.
    with Pool(4) as pool:
        pool.map(get_songs, data)
# Fetch every song of one playlist and save each as an mp3 file.
def get_songs(data):
    """Download all songs linked from one playlist page into ``./music/``.

    :param data: sequence whose element at index 1 is the playlist id
        (index 0 is not used here).
    """
    import os  # local import: only this function needs it

    # Without the target directory every open() below fails.
    os.makedirs('./music', exist_ok=True)
    playlist_url = 'https://music.163.com/playlist?id=%s'%data[1]
    res = requests.get(playlist_url, headers=headers, timeout=10)
    for song_id, song_name in re.findall(r'<a href="/song\?id=(\d+)">(.*?)</a>', res.text):
        down_url = 'http://music.163.com/song/media/outer/url?id=%s'%song_id
        print(down_url)
        # Titles may contain characters that are not valid in file names
        # (path separators everywhere, the rest on Windows).
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', song_name)
        try:
            # Download first so a failed request does not leave an empty file.
            content = requests.get(down_url, headers=headers, timeout=30).content
            with open('./music/'+safe_name+'.mp3', 'wb') as f:
                f.write(content)
        except (requests.RequestException, OSError) as err:
            # Best effort: report the failure and carry on with the next song.
            print('下载失败:%s (%s)' % (song_name, err))

if __name__ == '__main__':
    # Entry point: start from the playlist listing ordered by "hot".
    get_page('https://music.163.com/discover/playlist/?order=hot')

爬取网易云推荐歌单

import requests
import re
from multiprocessing import Pool
# Request headers used by the functions below: a desktop Chrome User-Agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/75.0.3770.142 Safari/537.36'
    ),
}
# Module-level list; nothing in the code shown here adds to it.
music_list = list()
def get_page(url):
    """Find the playlists on the discover page and download each of them.

    :param url: address of the page that lists the playlists.
    """
    res = requests.get(url, headers=headers, timeout=10).text
    # Each match is a (title, playlist_id) pair. The raw string keeps \? and
    # \d as regex escapes without invalid-escape warnings.
    data = re.findall(r'<a title="(.*?)" class="tit s-fc0" href="/playlist\?id=(\d+)"', res)
    print(data)
    # map() blocks until every playlist is processed; leaving the with-block
    # then shuts the four worker processes down instead of leaking them.
    with Pool(4) as pool:
        pool.map(get_song, data)

def get_song(data):
    """Download all songs linked from one playlist page into ``./music/``.

    :param data: sequence whose element at index 1 is the playlist id
        (index 0 is not used here).
    """
    import os  # local import: only this function needs it

    # Without the target directory every open() below fails.
    os.makedirs('./music', exist_ok=True)
    # Single slash before "playlist", matching the site's own link format.
    gedan_url = 'https://music.163.com/playlist?id=%s'%data[1]
    res = requests.get(gedan_url, headers=headers, timeout=10)
    for song_id, song_name in re.findall(r'<a href="/song\?id=(\d+)">(.*?)</a>', res.text):
        down_url = 'http://music.163.com/song/media/outer/url?id=%s'%song_id
        print(down_url)
        # Titles may contain characters that are not valid in file names
        # (path separators everywhere, the rest on Windows).
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', song_name)
        try:
            # Download first so a failed request does not leave an empty file.
            content = requests.get(down_url, headers=headers, timeout=30).content
            with open("./music/"+safe_name+".mp3", "wb") as f:
                f.write(content)
        except (requests.RequestException, OSError) as err:
            # Best effort: report the failure and carry on with the next song.
            print('下载失败:%s (%s)' % (song_name, err))

if __name__ == '__main__':
    # Entry point: start from the discover page.
    get_page('https://music.163.com/discover')

到头来 我们记住的 不是敌人的攻击 而是朋友的沉默 ---马丁·路德·金

Responses