
Scraping WeChat Official Account Articles

Goal: crawl all articles published by a specified WeChat Official Account (公众号).

References

【Python爬虫】微信公众号文章爬取思路分析与实践 (Bilibili video)

Approach 1

  1. Log in to your own Official Account platform, then go to 内容和互动 (Content & Interaction) -> 草稿箱 (Drafts).

  2. Click "insert hyperlink", switch the 公众号 row to "choose another account", then enter the name of the account you want to crawl and search for it.

  3. Press F12 and open the Network tab. Page through the search results and watch the requests: the fakeid and token appear in the request URL shown under Headers, and the cookie is in the request headers (a parsing sketch follows this list).

  4. Substitute the three values obtained above into the corresponding variables in the code.
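
Copying values out of a long URL by eye is error-prone; they can also be parsed programmatically. A minimal sketch, assuming captured holds an appmsg request URL copied from the Network tab (the fakeid and token values below are made-up placeholders):

from urllib.parse import urlparse, parse_qs

# An appmsg request URL copied from the browser's Network tab (placeholder values).
captured = ('https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0'
            '&count=5&fakeid=MzA5MTc0NTMwNQ%3D%3D&type=9&query=&token=1234567890'
            '&lang=zh_CN&f=json&ajax=1')

params = parse_qs(urlparse(captured).query)
fakeid = params['fakeid'][0]  # parse_qs decodes %3D%3D back to '=='
token = params['token'][0]
print(fakeid, token)

The cookie does not appear in the URL, so it still has to be copied from the request headers by hand.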

Code


# -*- coding: utf-8 -*-
# WeChat Official Account article crawler

import requests
import datetime
from time import sleep
from lxml import etree
import pandas as pd  # only needed for the optional Excel export at the end
import re
import os


def validateTitle(tit):
    # Replace characters that are illegal in filenames with '_'.
    reStr = r'[\/\\\:\*\?\"\<\>\|]'
    newTitle = re.sub(reStr, '_', tit)
    return newTitle


start = 1         # first page to fetch (1-based)
end = 10          # last page to fetch
fakeid = 'xxx=='  # from the captured request URL
token = 'xxx'     # from the captured request URL
cookie = 'xxx'    # from the captured request headers
ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36'

path = 'data'
if not os.path.exists(path):
    os.makedirs(path)
# resLs = []
for page in range(start - 1, end):  # each page of the list API holds 5 articles
    try:
        begin = page * 5
        url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin={}&count=5&fakeid={}&type=9&query=&token={}&lang=zh_CN&f=json&ajax=1'.format(begin, fakeid, token)
        headers = {
            'authority': 'mp.weixin.qq.com',
            'cookie': cookie,
            'referer': 'https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&token={}&lang=zh_CN'.format(token),
            'user-agent': ua
        }
        res = requests.get(url=url, headers=headers, timeout=10).json()
        if 'app_msg_list' not in res:
            print('unexpected response (expired token/cookie?):', res)
            break
        for k in res['app_msg_list']:
            timeStamp = k['update_time']
            pubTime = datetime.datetime.fromtimestamp(timeStamp).strftime('%Y-%m-%d %H:%M:%S')
            u = k['link']
            hd = {
                'User-Agent': ua,
                'cookie': cookie
            }
            r = requests.get(url=u, headers=hd, timeout=10).content.decode('utf-8')
            tree = etree.HTML(r)
            title = tree.xpath('//h1/text()')[0].strip()  # deleted articles lack <h1>
            content = '\n'.join(tree.xpath('//div[@id="js_content"]//text()')).strip()
            dic = {
                '标题': title,
                '时间': pubTime,
                '内容': content,
                '链接': u
            }
            # resLs.append(dic)
            print(dic)
            with open(f'{path}/{validateTitle(title)}.txt', 'w', encoding='utf-8') as f:
                f.write(f'{title}\n{pubTime}\n{content}')
            sleep(5)  # throttle requests to avoid triggering rate limits
    except Exception as e:
        # Any failure (expired credentials, deleted article, rate limit) stops the crawl.
        print(str(e))
        sleep(5)
        break
# Optional: append each dic to resLs above, then export everything to Excel:
# df = pd.DataFrame(resLs)
# df.to_excel('微信公众号.xlsx', index=False)
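
Instead of hard-coding end, the total number of pages can be derived from the list response itself. A minimal sketch, assuming the JSON returned by the appmsg endpoint includes an app_msg_cnt field holding the account's total article count (print one raw response to confirm the field name before relying on it):

import math
import requests

def total_pages(fakeid, token, headers, per_page=5):
    # Fetch only the first page and read the total article count from it.
    url = ('https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0'
           '&count={}&fakeid={}&type=9&query=&token={}&lang=zh_CN&f=json&ajax=1'
           ).format(per_page, fakeid, token)
    res = requests.get(url, headers=headers, timeout=10).json()
    return math.ceil(res['app_msg_cnt'] / per_page)  # app_msg_cnt is an assumption

With this, the main loop can iterate over range(total_pages(fakeid, token, headers)) rather than a fixed start/end.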