Crawling my Solo blog with a web scraper



I had previously written an interface for Feishu that sends messages, so while I was at it I crawled my blog, pulled out each article's title and link, and embedded them into the rich-text message.

  • Note that this only crawls articles rendered by Solo's built-in pingsu skin (the XPath expressions below are specific to that skin's markup).
#!/usr/bin/env python
#-*- coding: utf-8 -*-
'''
@file: solo.py
@time: 2020/3/25
@author: cuijianzhe
@contact: 598941324@qq.com
@software: PyCharm-2019.1.3
'''
import urllib.request
from lxml import etree
def get_ArticlesNums():
    url = "https://www.cjzshilong.cn/"
    # article title node for the pingsu skin: //*[@id="pjax"]/div/main/article[1]/div/div[5]/h2/a
    response = urllib.request.urlopen(url)
    html = response.read().decode()
    parseHtml = etree.HTML(html)
    all_article = len(parseHtml.xpath('//*[@id="pjax"]/div/nav/a')) + 1  # total number of pagination pages
    return all_article

def get_ArticleInfo(all_article):
    titles_list = []
    links_list = []
    for i in range(1,all_article+1):
        url_1 = "https://www.cjzshilong.cn/?p=%s" % i  # URL of pagination page i
        response_1 = urllib.request.urlopen(url_1)
        html_1 = response_1.read().decode()
        parseHtml_1 = etree.HTML(html_1)
        article_num = len(parseHtml_1.xpath('//*[@id="pjax"]/div/main/article'))  # number of articles on this page
        for n in range(1,article_num+1):
            titles = parseHtml_1.xpath('//*[@id="pjax"]/div/main/article[%s]/div/div[5]/h2/a/text()' % n)
            links = parseHtml_1.xpath('//*[@id="pjax"]/div/main/article[%s]/div/div[5]/h2/a//@href' % n)
            new_titles = ''.join(titles).strip().rsplit('\n')
            links_list.append(links)
            titles_list.append(new_titles)
    return titles_list,links_list
if __name__ == "__main__":
    Article_Nums = get_ArticlesNums()
    articles_info = get_ArticleInfo(Article_Nums)
    print(articles_info[0][1],articles_info[1][1])
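get_ArticleInfo returns two parallel lists whose elements are themselves lists (the titles because of rsplit, the links because xpath always returns a list). If you want plain (title, url) pairs instead, a small helper along these lines would do; pair_articles is a name of my own choosing, not part of the original script:

def pair_articles(titles_list, links_list):
    # flatten each (title fragments, link candidates) pair into a single (title, url) tuple
    pairs = []
    for title_parts, link_parts in zip(titles_list, links_list):
        title = ''.join(title_parts).strip()
        link = ''.join(link_parts).strip()
        if title and link:
            pairs.append((title, link))
    return pairs

# usage, reusing the functions defined above:
# for title, link in pair_articles(*get_ArticleInfo(get_ArticlesNums())):
#     print(title, link)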
  • Feishu integration
#!/usr/bin/env python3
########################################################
#  Send Feishu messages to Lijuan regularly            #
#  Date: 2020-2-24                                     #
#  Author: cuijianzhe                                  #
#  Email: 598941324@qq.com                             #
########################################################
import logging
import requests
import json
import os
import random
import linecache
import urllib.request
from lxml import etree

logging.basicConfig(filename='/scripts/feishu/log', level=logging.DEBUG,
                    datefmt='%Y-%m-%d %H:%M:%S',
                    format='%(asctime)s - %(levelname)s - %(lineno)d - %(message)s')
logger = logging.getLogger(__name__)

def get_token():
    data = {"app_id":"cli_9xxxxxxx0d","app_secret":"YJJxxxxxxxi"}
    headers = {"Content-Type": "application/json"}
    url_token = "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal/"
    try:
        res = requests.post(url_token, json=data, headers=headers)
        if res.status_code == 200:
            token = (json.loads(res.text)).get('tenant_access_token')
            return token
    except requests.RequestException as e:
        logger.error('Failed to request tenant_access_token: %s', e)
headers_group = {
    "Authorization" : "Bearer %s"%(get_token()),
    "Content-Type" : "application/json"
}

def get_ArticlesNums():
    url = "https://www.cjzshilong.cn"
    # article title node for the pingsu skin: //*[@id="pjax"]/div/main/article[1]/div/div[5]/h2/a
    response = urllib.request.urlopen(url)
    html = response.read().decode()
    parseHtml = etree.HTML(html)
    all_article = len(parseHtml.xpath('//*[@id="pjax"]/div/nav/a')) + 1  # total number of pagination pages
    return all_article

def get_ArticleInfo(all_article):
    titles_list = []
    links_list = []
    for i in range(1,all_article+1):
        url_1 = "https://www.cjzshilong.cn/?p=%s" % i  # URL of pagination page i
        response_1 = urllib.request.urlopen(url_1)
        html_1 = response_1.read().decode()
        parseHtml_1 = etree.HTML(html_1)
        article_num = len(parseHtml_1.xpath('//*[@id="pjax"]/div/main/article'))  # number of articles on this page
        for n in range(1,article_num+1):
            titles = parseHtml_1.xpath('//*[@id="pjax"]/div/main/article[%s]/div/div[5]/h2/a/text()' % n)
            links = parseHtml_1.xpath('//*[@id="pjax"]/div/main/article[%s]/div/div[5]/h2/a//@href' % n)
            new_titles = ''.join(titles).strip().rsplit('\n')
            links_list.append(links)
            titles_list.append(new_titles)
    return titles_list,links_list

def getuserid(mobile):   # look up a user_id by phone number
    userurl = "https://open.feishu.cn/open-apis/user/v1/batch_get_id?mobiles=%s"%mobile
    res_data = requests.get(url=userurl, headers=headers_group)
    userid = json.loads(res_data.text)['data']['mobile_users'][mobile][0]['user_id']
    return userid

def uploadimg():
    imgname = random.choice(os.listdir('/scripts/feishu/images'))
    # call the image upload API to get an image_key
    with open("/scripts/feishu/images/%s"%imgname,'rb') as p:
        image = p.read()
    imgurl = "https://open.feishu.cn/open-apis/image/v4/put/"
    headers = {"Authorization" : "Bearer %s"%get_token()}
    files = {
        'image':image
    }
    imgdata = {
        "image_type": "message"
    }
    resp = requests.post(url=imgurl,headers=headers,files=files,data=imgdata)
    os.remove('/scripts/feishu/images/%s'%imgname)
    resp.raise_for_status()
    content = resp.json()
    return content['data']['image_key']

def sendmess(path,title,link,user_id,image_key=None):
    # pick a random line (format "n、sentence") from the phrase file and keep the part after '、'
    with open(path, encoding='utf-8') as yuju:
        qinghua = linecache.getline(path, random.randint(1, len(yuju.readlines()))).split('、')[1].strip().replace(' ','  ')
    message_url = "https://open.feishu.cn/open-apis/message/v4/send/"
    # send a rich-text ("post") message
    data = {
        "user_id": user_id,
        "msg_type": "post",
        "content": {
            "post": {
                "zh_cn": {
                    "title": "表情包来了",
                    "content": [
                        [
                            {
                                "tag": "text",
                                "un_escape": True,
                                "text": "%s :"%qinghua
                            },
                        ],
                        [
                            {
                                "tag": "text",
                                "un_escape": True,
                                "text": "今日博文 :"
                            },
                            {
                                "tag": "a",
                                "text": "%s"%title,
                                "href": "%s"%link
                            },
                            {
                                "tag": "at",
                                "user_id": user_id

                            }
                        ],
                        [
                            {
                                "tag": "img",
                                "image_key": image_key,
                                "width": 1200,
                                "height": 1200
                            }
                        ]
                    ]
                }
            }
        }
    }
    request = requests.post(url=message_url, headers=headers_group, json=data)
if __name__ == '__main__':
    logger.info('Started..')
    Article_Nums = get_ArticlesNums()
    articles_info = get_ArticleInfo(Article_Nums)
    num = random.randint(0, len(articles_info[0]) - 1)  # random article index (lists are 0-based)
    mobiles = ["18xxxxxx42","17xxxxxxx3"]
    for iphone in mobiles:
        user_ID = getuserid(iphone)
        title = ''.join(articles_info[0][num])
        link = ''.join(articles_info[1][num])
        imgkey = uploadimg()
        sendmess('/scripts/feishu/wenben',title,link,user_ID,imgkey)
    logger.info("Finished!\n")

The result looks like this:

[screenshot: the rich-text message received in Feishu]
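Since the header says the script runs regularly, it is presumably driven by cron; an entry along these lines would work (the script path and the 9:00 schedule are assumptions, not from the original setup):

# run the Feishu script every morning at 9:00 (path assumed)
0 9 * * * /usr/bin/python3 /scripts/feishu/feishu_solo.py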

Crawling article titles and links from the Casper skin

import urllib.request
from lxml import etree
import random
def get_ArticlesNums():
    url = "https://sszsj.top/"
    response = urllib.request.urlopen(url)
    html = response.read().decode()
    parseHtml = etree.HTML(html)
    all_article = len(parseHtml.xpath('//*[@id="pjax"]/div/nav/a')) + 1  # total number of pagination pages
    return all_article
def get_ArticleInfo(all_article):
    titles_list = []
    links_list = []
    for i in range(1,all_article+1):
        url_1 = "https://sszsj.top/?p=%s" % i  # URL of pagination page i
        response_1 = urllib.request.urlopen(url_1)
        html_1 = response_1.read().decode()
        parseHtml_1 = etree.HTML(html_1)
        article_num = len(parseHtml_1.xpath('//*[@id="pjax"]/div/div/article'))  # number of articles on this page
        for n in range(1,article_num+1):  # iterate over the titles and links on this page
            titles = parseHtml_1.xpath('//*[@id="pjax"]/div/div/article[%s]/div/h2/a/text()' % n)
            links = parseHtml_1.xpath('//*[@id="pjax"]/div/div/article[%s]/div/h2/a//@href' % n)
            new_titles = ''.join(titles).strip().rsplit('\n')
            links_list.append(links)
            titles_list.append(new_titles)
    return titles_list,links_list
if __name__ == "__main__":
    Article_Nums = get_ArticlesNums()
    articles_info = get_ArticleInfo(Article_Nums)
    num = random.randint(0, len(articles_info[0]) - 1)  # random article index (lists are 0-based)
    print(articles_info[0][num],articles_info[1][num])
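The Casper version only differs from the pingsu one in its XPath (div/div/article and div/h2 instead of div/main/article and div/div[5]/h2), so the two scripts could be merged by passing the skin-specific XPath templates in as parameters. A rough sketch, with function and parameter names of my own choosing:

import urllib.request
from lxml import etree

def get_article_info(base_url, article_xpath, title_xpath_tmpl, link_xpath_tmpl):
    # count pagination links on the front page to get the total number of pages
    html = urllib.request.urlopen(base_url).read().decode()
    pages = len(etree.HTML(html).xpath('//*[@id="pjax"]/div/nav/a')) + 1
    titles, links = [], []
    for p in range(1, pages + 1):
        page_html = urllib.request.urlopen("%s/?p=%s" % (base_url, p)).read().decode()
        tree = etree.HTML(page_html)
        for n in range(1, len(tree.xpath(article_xpath)) + 1):
            titles.append(''.join(tree.xpath(title_xpath_tmpl % n)).strip())
            links.append(''.join(tree.xpath(link_xpath_tmpl % n)).strip())
    return titles, links

# pingsu skin:
# get_article_info("https://www.cjzshilong.cn",
#                  '//*[@id="pjax"]/div/main/article',
#                  '//*[@id="pjax"]/div/main/article[%s]/div/div[5]/h2/a/text()',
#                  '//*[@id="pjax"]/div/main/article[%s]/div/div[5]/h2/a//@href')
# Casper skin:
# get_article_info("https://sszsj.top",
#                  '//*[@id="pjax"]/div/div/article',
#                  '//*[@id="pjax"]/div/div/article[%s]/div/h2/a/text()',
#                  '//*[@id="pjax"]/div/div/article[%s]/div/h2/a//@href')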