python公众号文章_Python 抓取微信公众号文章

python公众号文章_Python 抓取微信公众号文章

2024-12-21 09:56

起因是刷微信的时候看到一篇文章，Python 抓取微信公众号文章保存成pdf，很容易搜到，就不贴出来了

先用 Chrome 登录微信公众号后台，获取自己的 cookie 并复制下来，然后解析一下，转换成 JSON 格式写入文本文件

import json

def parse_cookie_string(raw):
    """Convert a browser ``Cookie`` header string into a ``{name: value}`` dict.

    Args:
        raw: Text such as ``"a=1; b=2"`` copied from the browser.

    Returns:
        A dict mapping each cookie name to its value. Segments without an
        ``=`` (for example the untouched placeholder text) are ignored.
    """
    parsed = {}
    for pair in raw.split(";"):
        # Split on the FIRST "=" only: cookie values may themselves contain
        # "=" (base64 padding, for instance) and must not be truncated.
        name, sep, value = pair.strip().partition("=")
        if not sep or not name:
            continue
        parsed[name] = value
    return parsed


# Cookie string of your own official account: paste it between the quotes.
cookie_str = "粘贴到这里"

cookie = parse_cookie_string(cookie_str)

# Write the cookies to a local file as JSON so the download scripts can load them.
with open('cookies.txt', "w") as file:
    file.write(json.dumps(cookie))

然后在新建图文消息的地方插入超链接，搜索想要的微信公众号名称后选一篇文章，F12可以看到一些信息，begin 是从第几篇文章开始，count 是一次查出几篇，fakeId 对应这个公号的唯一 Id，token 是通过 cookie 信息来获取的。

核心代码如下

# gzh_download.py

#-*- coding = utf-8 -*-

# 引入模块

import requests

import json

import re

import random

import time

import pdfkit

# Load the cookies saved earlier (cookies.txt holds a JSON object).
with open("cookies.txt", "r") as file:
    cookie = file.read()

cookies = json.loads(cookie)

url = "https://mp.weixin.qq.com"

# Request the official-account platform with the saved cookies.
response = requests.get(url, cookies=cookies)

# Extract the token from the final URL. The pattern needs "\d+" (digits);
# a bare "d+" would only ever match literal letter d's.
token_match = re.search(r'token=(\d+)', str(response.url))
if token_match is None:
    raise RuntimeError(
        "No token found in " + str(response.url) + "; the cookies in cookies.txt may have expired."
    )
token = token_match.group(1)

# Request headers sent with every article-list query.
headers = {
    "Referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&token=" + token + "&lang=zh_CN",
    "Host": "mp.weixin.qq.com",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36",
}

# Install location of the portable wkhtmltopdf build; adjust to your machine.
# pdfkit needs the wkhtmltopdf executable here, not wkhtmltoimage.
path_wk = r'E:\迅雷下载\Compressed\wkhtmltox-0.12.5-1.mxe-cross-win64\wkhtmltox\bin\wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=path_wk)
# Quick check of the wkhtmltopdf setup:
# pdfkit.from_url(url, "test.pdf", configuration=config)

# fakeid of the target official account (seen in the F12 network panel).
fakeid = "填刚刚的fakeid"

# Walk the first 10 pages of the article list, 5 articles per page, and store
# one "link<=====>title" record per line. The file is opened once in text
# mode: reopening it per article would keep only the last record.
with open('article_link.txt', 'w', encoding='utf-8') as f:
    for page in range(10):
        begin = page * 5

        # Request the current page of the article list.
        requestUrl = (
            "https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=" + str(begin)
            + "&count=5&fakeid=" + fakeid
            + "&type=9&query=&token=" + token + "&lang=zh_CN&f=json&ajax=1"
        )
        search_response = requests.get(requestUrl, cookies=cookies, headers=headers)

        # Parse the returned JSON.
        re_text = search_response.json()
        articles = re_text.get("app_msg_list")
        if not articles:
            # Either there are no more articles or the request was refused;
            # show the raw reply and stop instead of crashing on None.
            print("No article list in response, stopping:", re_text)
            break

        # Record every article of the current page.
        for i in articles:
            print(i["link"], i["title"])
            f.write(i["link"] + "<=====>" + i["title"] + "\n")
            # Optional: convert the article straight to PDF in the current directory.
            # pdfkit.from_url(i["link"], i["title"] + ".pdf", configuration=config)

        f.flush()  # keep what was collected so far even if a later page fails

        # Requesting too quickly may get the account blocked; wait 10 seconds
        # between list requests.
        time.sleep(10)

这段代码是直接粘过来的，稍微做了点改动，将 URL 转成 PDF 时使用的是 pdfkit 的模块，使用这个需要先安装 wkhtmltopdf，官网https://wkhtmltopdf.org/downloads.html直接下载相应版本，由于我下载的便携版，所以写入pdf的时候增加了configuration=config

这样执行下来会有个问题，微信公众号的图片无法写入pdf文件中，于是我就先把每篇文章的链接保存下来，防止每次运行都爬一次，被ban。

不显示图片，无奈了，很多图片还是很重要的，毕竟是为了学习[狗头]

搜索一下，我发现可以用的解决方法有两个，一是使用wechatsogou(基于搜狗微信搜索的微信公众号爬虫接口)，使用这个模块中一个接口来处理

# Fragment only: `ws_api` and `url` are not defined here (presumably a
# wechatsogou API object and an article URL; confirm before running).
# Per the article, this call processes the article HTML so that images display.

content_info = ws_api.get_article_content(url)

# Get the HTML code (incomplete: head, body and similar tags still need to be added).

html_code = content_info['content_html']

感觉用这个的话还得重新爬一遍，就没多研究这个，先收藏，以后再研究

来自这个网站

https://www.lagou.com/lgeduarticle/37166.html

还有一种方法就是把所有的图片下载到本地，然后用wkhtmltopdf生成pdf，就没有问题了，本着拿来即用的态度，找到一篇

来自这个博客

https://blog.csdn.net/weixin_41267342/article/details/96729138

稍作改动，配合自己之前生成的链接文本

# -*- coding:utf-8 -*-

import json

import re

import time

from bs4 import BeautifulSoup

import requests

import os

# Save the page to local disk
def save_html(url_content, htmlDir, file_name):
    """Write the body of a response to ``<htmlDir>/<file_name>.html``.

    Args:
        url_content: Response-like object whose ``content`` attribute holds
            the page bytes.
        htmlDir: Directory that receives the HTML file; created if missing.
        file_name: File name without the ``.html`` extension.

    Returns:
        ``url_content`` unchanged, so the caller can keep using the response.
    """
    os.makedirs(htmlDir, exist_ok=True)
    # os.path.join replaces the hand-written "\" separator, which is not even
    # a valid string literal; ``with`` closes the file if the write fails.
    with open(os.path.join(htmlDir, file_name + '.html'), 'wb') as f:
        f.write(url_content.content)
    return url_content

# Rewrite the saved page so that image paths point at the local copies
def update_file(old, new, htmlDir, file_name=None):
    """Replace one image URL in ``<htmlDir>/<file_name>.html`` with a local path.

    Every occurrence of ``old`` is replaced by ``new``, and every ``data-src``
    is renamed to ``src``. The result is written to a ``_bak.html`` sibling
    that then replaces the original file.

    Args:
        old: Image URL as it appears in the page (may be relative).
        new: Local path that should replace it.
        htmlDir: Directory containing the HTML file.
        file_name: HTML file name without extension. When omitted, the
            module-level ``file_name`` variable is used, which is what the
            original three-argument call relied on.
    """
    if file_name is None:
        # Backward compatibility with callers that only pass three arguments.
        file_name = globals()["file_name"]

    src_path = os.path.join(htmlDir, file_name + '.html')
    bak_path = os.path.join(htmlDir, file_name + '_bak.html')
    # A URL taken from a parsed attribute has "&" where the raw HTML has "&amp;".
    escaped_old = old.replace("&", "&amp;")

    # Two files: the original is read, the modified content goes to the other.
    with open(src_path, encoding='utf-8') as f, open(bak_path, 'w', encoding='utf-8') as fw:
        for line in f:
            new_line = line
            if old:  # replacing "" would insert `new` between every character
                new_line = new_line.replace(old, new)
                if escaped_old != old:
                    new_line = new_line.replace(escaped_old, new)
            new_line = new_line.replace("data-src", "src")
            fw.write(new_line)

    # os.replace swaps the file in one step, so the former
    # remove / sleep(10) / rename sequence (10 s per image) is not needed.
    os.replace(bak_path, src_path)
    print('当前保存文件为：' + file_name + '.html')

# Save the page's images to local disk
def save_file_to_local(htmlDir, targetDir, search_response, domain):
    """Download the images of an article and point the saved HTML at them.

    The page is written via ``save_html`` and parsed with BeautifulSoup. Each
    supported image URL is downloaded to ``<targetDir>/<n>.jpeg`` (n = 0, 1,
    2, ...) and ``update_file`` rewrites that URL in the saved HTML.

    NOTE: the HTML file name comes from the module-level ``file_name``
    variable, which the calling script must set before calling this function.

    Args:
        htmlDir: Directory holding the saved HTML file.
        targetDir: Existing directory that receives the image files.
        search_response: Response object of the article page.
        domain: Prefix put in front of site-relative ``.gif`` URLs.
    """
    # lxml is chosen as the parser: fast and tolerant of broken markup.
    obj = BeautifulSoup(save_html(search_response, htmlDir, file_name).content, 'lxml')

    # Collect the image links, preferring the lazy-load "data-src" attribute.
    # Reading attributes with .get() avoids a KeyError when the attribute name
    # merely appears somewhere else in the tag text.
    urls = []
    for img in obj.find_all('img'):
        link = img.get('data-src') or img.get('src')
        if link and link not in urls:  # one download per distinct URL
            urls.append(link)

    # Save each image into the target directory, named 0, 1, 2, ...
    i = 0
    for each_url in urls:
        if each_url.startswith('//'):
            # protocol-relative link
            new_url = 'https:' + each_url
        elif each_url.startswith('/') and each_url.endswith('gif'):
            # site-relative gif
            new_url = domain + each_url
        elif each_url.endswith(('png', 'jpg', 'gif', 'jpeg')):
            new_url = each_url
        else:
            # Unsupported form: skip it. Falling through would reuse the
            # previous image's response, or fail on an unbound name.
            print('跳过不支持的图片链接：' + each_url)
            continue

        r_pic = requests.get(new_url)
        t = os.path.join(targetDir, str(i) + '.jpeg')  # destination path
        print('当前保存图片为：' + t)
        with open(t, 'wb') as fw:
            fw.write(r_pic.content)
        i += 1

        # Replace the old (possibly relative) link with the local path so the
        # page shows its images when opened locally.
        update_file(each_url, t, htmlDir)

# Download the HTML page and its images
def save(search_response, file_name):
    """Store one article under ``<script dir>/<file_name>/``.

    The HTML goes to ``<script dir>/<file_name>/<file_name>.html`` and the
    images to ``<script dir>/<file_name>/imgs1/``.

    NOTE: ``save_file_to_local`` reads the module-level ``file_name``
    variable rather than this parameter, so the caller must keep the two
    equal.

    Args:
        search_response: Response object of the article page.
        file_name: Article title, used as directory and file name.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    htmlDir = os.path.join(base_dir, file_name)
    # Images live in an "imgs1" folder INSIDE the article folder, e.g. with
    # htmlDir 'D:\Coding' they are saved under 'D:\Coding\imgs1'. Creating it
    # also creates htmlDir, which the HTML writer needs.
    targetDir = os.path.join(htmlDir, 'imgs1')
    os.makedirs(targetDir, exist_ok=True)

    domain = 'https://mp.weixin.qq.com/s'
    # save_file_to_local writes the HTML itself through save_html, so no
    # separate save_html call is made here.
    save_file_to_local(htmlDir, targetDir, search_response, domain)

# Load the cookies needed for the logged-in session
with open("cookies.txt", "r") as file:
    cookie = file.read()

cookies = json.loads(cookie)

url = "https://mp.weixin.qq.com"

response = requests.get(url, cookies=cookies)

# The pattern needs "\d+" (digits); a bare "d+" would only match letter d's.
token_match = re.search(r'token=(\d+)', str(response.url))
if token_match is None:
    raise RuntimeError(
        "No token found in " + str(response.url) + "; the cookies in cookies.txt may have expired."
    )
token = token_match.group(1)

print(token)

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
    "Referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&token="+token+"&lang=zh_CN",
    "Host": "mp.weixin.qq.com",
}

# Each line of article_link.txt is "link<=====>title". The file is opened
# exactly once, inside ``with``, so the handle is always closed.
with open("article_link.txt", encoding='utf-8') as link_file:
    for line in link_file:
        new_line = line.strip()
        if not new_line:
            continue  # blank line

        requestUrl, separator, title = new_line.partition("<=====>")
        if not separator:
            print("Skipping malformed line: " + new_line)
            continue

        # The title becomes a directory and file name, so replace characters
        # that are not allowed in Windows paths. The download helpers read
        # this module-level ``file_name``.
        file_name = re.sub(r'[\\/:*?"<>|]', '_', title).strip()

        search_response = requests.get(requestUrl, cookies=cookies, headers=headers)
        save(search_response, file_name)

        print(file_name+"----------------下载完毕："+requestUrl)

        time.sleep(2)

通过这个程序，就把所有的链接的图片下载到当前目录，并把html里的图片路径转换成了本地「绝对」路径，为什么绝对要加引号呢，因为绝对路径，意味着移动一下就不能用了，尴尬。不过还好不影响我现在用wkhtmltopdf生成pdf文档，python也安装了pdfkit，不过我突然有种错觉，似乎写个bat更简单，果然，学习一下bat，几行就搞定了，这么多字了，这部分下篇再写。

还有替换绝对路径，也放在下篇吧，简单的正则替换。

回到刚刚下载html，真正执行的时候，还有个很大的问题，太慢了，单线程的，睡了一觉，玩了半天才下载了不到40篇，想到我之前也学习过多线程的脚本，但是记忆有点模糊，重新拾起似乎还要复习一会，而且也容易被ban吧，毕竟用的是公众号的cookie，不太安全，遂采用手工多线程，开四个程序慢慢跑-----就是把链接文本拆分一下，保存成4个文档，再稍微改一下主程序，开四个窗口跑[再次狗头]，后面再优化一下这个脚本。

以上就是本篇文章【python公众号文章_Python 抓取微信公众号文章】的全部内容了，欢迎阅览！文章地址：http://www78564.xrbh.cn/quote/27446.html
动态相关文章文章同类文章热门文章栏目首页网站地图返回首页迅博思语移动站 http://www78564.xrbh.cn/mobile/ , 查看更多