python爬虫实例

Updated on in 人生苦短,我用Python with 141 views and 1 comments

****# 虽然没什么用,学习阶段,练手。

爬取表情包

爬取热门表情

import requests
from bs4 import BeautifulSoup
import re
url = "https://www.fabiaoqing.com/biaoqing"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    "Referer": "https://www.baidu.com"
}

def download_all_htmls():
    """
    下载所有列表页面的HTML,用于后续的分析
    """
    htmls = []
    for idx in range(10):
        url = f"https://fabiaoqing.com/biaoqing/lists/page/{idx+1}.html"
        print("craw html:", url)
        r = requests.get(url)
        if r.status_code != 200:
            raise Exception("error")
        htmls.append(r.text)
    print("success")
    return htmls
htmls = download_all_htmls()

def parse_html(html):
    """
        解析单个HTML,得到数据
        @return list((img_title, img_url))
    """
    soup = BeautifulSoup(html,'html.parser')
    img_divs = soup.find_all("div",class_="tagbqppdiv")
    datas = []
    for img_div in img_divs:
        img_node = img_div.find("img")
        if not img_node: continue
        datas.append((img_node["title"], img_node["data-original"]))
    return datas

# 执行所有的HTML页面的解析
all_imgs = []
for html in htmls:
    all_imgs.extend(parse_html(html))

for idx,(title,img_url) in enumerate(all_imgs):
    # 移除标点符号,只保留中文、大小写字母和阿拉伯数字
    reg = "[^0-9A-Za-z\u4e00-\u9fa5]"
    title = re.sub(reg, '', title)
    # 发现了超长的图片标题,做截断
    if len(title)>10:title = title[:10]
    # 得到jpg还是gif后缀
    post_fix = img_url[-3:]
    filename = f"./output/{title}.{post_fix}"

    print(idx, filename)
    img_data = requests.get(img_url)
    with open(filename, "wb")as f:
        f.write(img_data.content)
print("success")

爬取黑客派排行榜

import requests
import re
html =  requests.get('https://hacpai.com/top/general').text
result = re.findall('class="fn-flex-1".*?aria-name="(.*?)".*?href="(.*?)".*?',html,flags=re.S)
#paihang_str = str(result)
for value in result:
    name, url = value
    tup1 = name.replace("/"," "),url
    tup2 = ' '.join(tup1)
    print(tup2)
    with open('paihangbang.txt','a',encoding='utf-8') as f:
        f.write(tup2)

**效果如下:**
image.png

爬取豆瓣电影排行榜

import requests

import re
content = requests.get('https://movie.douban.com/chart').text
# 豆瓣电影排行榜
pattern = re.compile('class="pl2".*?<.*?="(.*?)".*?>(.*?)<span.*?>(.*?)</span>.*?"rating_nums">(.*?)</span>.*?"pl">(.*?)</span>', re.S)
# compile可以在多次使用中提高效率,这里影响不大
results = re.findall(pattern, content)
for result in results:
    url, name1, name2, nums, pl = result
    print(url, name1.replace("/","").strip(), name2.replace("/","").strip(), nums, pl)

python 带 cookie 登录黑客派

import requests
from fake_useragent import UserAgent

headers = {
    'Referer': 'https://hacpai.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
}

data = {
    "nameOrEmail":"cuijianzhe",
    "userPassword":"54949cdcbe4d252ba3400897883df589811",
    "captcha":""
}

request = requests.session()

base_url = "https://hacpai.com/login"
res = requests.post(base_url,headers=headers,json=data)
res.raise_for_status()

print(res.text)

二、 爬取网易云音乐

  • 第一步:先把网络请求 get 下来
#http://music.163.com/song/media/outer/url?id=1377519494  #对外开放的下载接口

import requests
import urllib.request  #进行网络数据下载到本地
from fake_useragent import UserAgent
import re

def getResponse(url,headers):
    '''
    :return: html信息
    '''
    try:
        response = requests.get(url=url,headers=headers)
        if response.status_code == 200:
            return response
        return None
    except:
        return None

if __name__ == '__main__':
    url = 'https://music.163.com/song?id=1377519494'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    res = getResponse(url,headers)
    res
    print(res.text)

歌曲名信息:
image.png

  • 第二步:
……
def getSongName(songid):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    url = 'https://music.163.com/song?id={}'.format(songid)
    Text = getResponse(url,headers=headers).text
    titile = re.findall('<title>(.*?)</title>',Text,re.S)
    name = titile[0].split('-')[0]
    return name.strip()
……

if __name__ == '__main__':
    name = getSongName(1377519494)
    print(name)

获取歌曲名称:
image.png

  • 第三步下载单曲:
……
if __name__ == '__main__':
    songid = input('请输入要下载的歌曲id:')
    url = 'http://music.163.com/song/media/outer/url?id={}.mp3'.format(songid)
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    down_url = getResponse(url,headers).url
    SongName = getSongName(songid)
    urllib.request.urlretrieve(down_url,SongName+'.mp3')
……

下载成功:
image.png

  • 完整代码:
import requests
import urllib.request  #进行网络数据下载到本地
from fake_useragent import UserAgent
import re
def getResponse(url,headers):
    '''
    :return: html信息
    '''
    try:
        response = requests.get(url=url,headers=headers)
        if response.status_code == 200:
            return response
        return None
    except:
        return None

def getSongName(songid):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    url = 'https://music.163.com/song?id={}'.format(songid)
    Text = getResponse(url,headers=headers).text
    titile = re.findall('<title>(.*?)</title>',Text,re.S)
    name = titile[0].split('-')[0]
    return name.strip()

if __name__ == '__main__':
    songid = input('请输入要下载的歌曲id:')
    url = 'http://music.163.com/song/media/outer/url?id={}.mp3'.format(songid)
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    down_url = getResponse(url,headers).url
    SongName = getSongName(songid)
    urllib.request.urlretrieve(down_url,SongName+'.mp3') #网上歌曲映射到本地

爬取热门歌单并批量下载

import requests
import re
from multiprocessing import Pool

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
}
#todo:下载整个歌单网站信息
def get_page(url):
    res = requests.get(url,headers=headers)
    data = re.findall('<a title="(.*?)" href="/playlist\?id=(\d+)" .*?</a>',res.text)
    print(data)
    pool = Pool(4)
    pool.map(get_songs,data)
#todo:获取整个歌单歌曲并进行下载
def get_songs(data):
    playlist_url = 'https://music.163.com/playlist?id=%s'%data[1]
    res = requests.get(playlist_url,headers=headers)
    for i in re.findall('<a href="/song\?id=(\d+)">(.*?)</a>',res.text):
        down_url = 'http://music.163.com/song/media/outer/url?id=%s'%i[0]
        print(down_url)
        try:
            with open('./music/'+i[1]+'.mp3','wb') as f:
                f.write(requests.get(down_url,headers=headers).content)
        except:
            pass

if __name__ == '__main__':
    playlist_url = 'https://music.163.com/discover/playlist/?order=hot'
    get_page(playlist_url)

爬取网易云推荐歌单

import requests
import re
from multiprocessing import Pool
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
}
music_list = []
def get_page(url):
    res = (requests.get(url,headers=headers)).text
    data = re.findall('<a title="(.*?)" class="tit s-fc0" href="/playlist\?id=(\d+)"',res)
    print(data)
    pool = Pool(4)
    pool.map(get_song,data)

def get_song(data):
    gedan_url = 'https://music.163.com//playlist?id=%s'%data[1]
    res = requests.get(gedan_url,headers=headers)
    for i in re.findall('<a href="/song\?id=(\d+)">(.*?)</a>',res.text):
        down_url = 'http://music.163.com/song/media/outer/url?id=%s'%i[0]
        print(down_url)
        try:
            with open("./music/"+i[1]+".mp3","wb") as f:
                f.write(requests.get(down_url,headers=headers).content)
        except:
            pass

if __name__ == '__main__':
    my_url = 'https://music.163.com/discover'
    get_page(my_url)

到头来 我们记住的 不是敌人的攻击 而是朋友的沉默 ---马丁·路德·金