Scraping WeChat Official Account content: the odd parameters, especially the POST parameters, take time to figure out
Scraping content from a WeChat Official Account is rather odd: its request parameters, especially the POST parameters, take time to work out. What is collected here is the content under a topic tag, printed out to PDF with pdfkit.
Two versions are implemented. The first uses direct web requests. The real address, the POST URL, involves still more parameters that I have not tried to work out, so the content it retrieves is only partial and not ideal. The second version drives a headless browser to visit the pages directly, grabs the page source, parses it, and extracts the content you want.
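A quick way to see the difference between the two approaches (a minimal sketch of my own, not from the original post): fetch the homepage with plain requests and check what is actually present in the raw HTML. Version 1 mines a JS data blob for links; Version 2 reads the anchor list that the browser renders.

import requests

url = "https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14"
html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=8).text
# Version 1 bets on the article list being embedded in a script blob:
print('var data' in html)      # True would mean the links can be mined with regexes
# Version 2 bets on the rendered anchor list, which may only exist after JS runs:
print('article_list' in html)  # False would mean a real browser is needed to see it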
This scumbag is feeling lazy these days, so the code is all old, ready-made stuff: copied, tweaked, and used as-is!
Version 1:
# -*- coding: UTF-8 -*-
# Fetch WeChat Official Account content and print it to PDF
# by WeChat: huguo00289
# Target homepage:
# https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14
import requests
from fake_useragent import UserAgent
import os, re
import pdfkit

# Path to the local wkhtmltopdf binary (machine-specific)
config = pdfkit.configuration(
    wkhtmltopdf=r'D:\wkhtmltox-0.12.5-1.mxe-cross-win64\wkhtmltox\bin\wkhtmltopdf.exe')
class Du():
    def __init__(self, furl):
        ua = UserAgent()
        self.headers = {
            "User-Agent": ua.random,  # random UA so requests look less like a bot
        }
        self.url = furl

    def get_urls(self):
        # The homepage embeds its article list in a JS blob rather than rendered anchors
        response = requests.get(self.url, headers=self.headers, timeout=8)
        html = response.content.decode('utf-8')
        req = re.findall(r'var data={(.+?)if', html, re.S)[0]
        # Pull every "link" field out of the blob; links may still contain
        # HTML-escaped characters such as &amp;
        urls = re.findall(r',"link":"(.+?)",', req, re.S)
        urls = set(urls)  # dedupe
        print(len(urls))
        return urls
    def get_content(self, url, category):
        response = requests.get(url, headers=self.headers, timeout=8)
        print(response.status_code)
        html = response.content.decode('utf-8')
        # Everything before WeChat's first_sceen__time script marker holds the title and body
        req = re.findall(r'(.+?)var first_sceen__time', html, re.S)[0]
        # Get the title. The HTML tags in the original regexes were lost when this post
        # was scraped; the patterns below are reconstructed assumptions based on the
        # standard WeChat article markup.
        h1 = re.findall(r'<h1 class="rich_media_title"[^>]*>(.+?)</h1>', req, re.S)[0]
        h1 = h1.strip()
        # Replace characters that are illegal in Windows filenames with underscores
        # (the < and > were likely stripped from the original pattern during extraction)
        pattern = r'[\/\\\:\*\?\"\<\>\|]'
        h1 = re.sub(pattern, "_", h1)
        print(h1)
        # Get the article body (reconstructed pattern, same caveat as above; the
        # non-greedy match stops at the first closing div and may truncate nested markup)
        detail = re.findall(r'<div class="rich_media_content"[^>]*>(.+?)</div>', req, re.S)[0]
        data = f'{h1}\n{detail}'
        self.dypdf(h1, data, category)
        return data
    def dypdf(self, h1, data, category):
        print("Printing content...")
        pdfkit.from_string(data, f'{category}/{h1}.pdf', configuration=config)
        print("PDF saved.")
if __name__ == '__main__':
    furl = "https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14"
    category = "潘通色卡(电子版)"
    datas = ''
    os.makedirs(f'{category}/', exist_ok=True)
    spider = Du(furl)
    urls = spider.get_urls()
    for url in urls:
        print(f">> Scraping link: {url} ..")
        try:
            data = spider.get_content(url, category)
            datas = '%s%s%s' % (datas, '\n', data)  # collect only successful fetches
        except Exception as e:
            print(f"Scraping failed with error: {e}")
    # Also save everything as a single combined PDF
    spider.dypdf(category, datas, category)
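One optional tweak worth knowing (my assumption, not part of the original post): wkhtmltopdf sometimes garbles Chinese text when the HTML fragment carries no encoding declaration, and pdfkit accepts an options dict that maps straight onto wkhtmltopdf flags:

import pdfkit

config = pdfkit.configuration(
    wkhtmltopdf=r'D:\wkhtmltox-0.12.5-1.mxe-cross-win64\wkhtmltox\bin\wkhtmltopdf.exe')
options = {'encoding': 'UTF-8',  # --encoding UTF-8, keeps Chinese text intact
           'quiet': ''}          # --quiet, silences wkhtmltopdf console output
pdfkit.from_string('<h1>测试</h1>', 'test.pdf', options=options, configuration=config)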
Version 2:
# -*- coding: UTF-8 -*-
# Fetch WeChat Official Account content and print it to PDF
# by WeChat: huguo00289
# Target homepage:
# https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14
from selenium import webdriver
import os, re, time
import pdfkit
from bs4 import BeautifulSoup

# Path to the local wkhtmltopdf binary (machine-specific)
config = pdfkit.configuration(
    wkhtmltopdf=r'D:\wkhtmltox-0.12.5-1.mxe-cross-win64\wkhtmltox\bin\wkhtmltopdf.exe')
class wx():
    def __init__(self, furl):
        self.url = furl
        # Location of the chromedriver binary (machine-specific)
        self.chrome_driver = r'C:\Users\Administrator\Desktop\chromedriver_win32\chromedriver.exe'
        self.browser = webdriver.Chrome(executable_path=self.chrome_driver)

    def get_urls(self):
        # Let the browser render the homepage, then read the article anchors
        urls = []
        self.browser.get(self.url)
        hrefs = self.browser.find_elements_by_xpath(
            "//div[@class='article_list']/a[@class='list_item js_post']")
        for href in hrefs:
            url = href.get_attribute('href')
            urls.append(url)
        print(len(urls))
        return urls
    def get_content(self, url, category):
        self.browser.get(url)
        time.sleep(5)  # give the page's JS time to finish rendering
        # page_source returns the rendered HTML
        pageSource = self.browser.page_source
        soup = BeautifulSoup(pageSource, 'lxml')
        # Get the title. The HTML tags in the original regex were lost when this post
        # was scraped; the pattern below is a reconstructed assumption based on the
        # standard WeChat article markup.
        h1 = re.findall(r'<h1 class="rich_media_title"[^>]*>(.+?)</h1>', pageSource, re.S)[0]
        h1 = h1.strip()
        # Replace characters that are illegal in Windows filenames with underscores
        # (the < and > were likely stripped from the original pattern during extraction)
        pattern = r'[\/\\\:\*\?\"\<\>\|]'
        h1 = re.sub(pattern, "_", h1)
        print(h1)
        # Get the article body
        detail = soup.find('div', class_="rich_media_content")
        detail = str(detail)
        # Strip this account's "follow us" banner from the body
        del_text = """↑ 点击上方“染整百科”关注我们</strong>"""
        detail = detail.replace(del_text, '')
        data = f'{h1}\n{detail}'
        self.dypdf(h1, data, category)
        return data
    def dypdf(self, h1, data, category):
        print("Printing content...")
        pdfkit.from_string(data, f'{category}/{h1}.pdf', configuration=config)
        print("PDF saved.")

    def quit(self):
        self.browser.quit()
if __name__ == '__main__':
    furl = "https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14"
    category = "潘通色卡(电子版)"
    datas = ''
    os.makedirs(f'{category}/', exist_ok=True)
    spider = wx(furl)
    urls = spider.get_urls()
    for url in urls:
        print(f">> Scraping link: {url} ..")
        try:
            data = spider.get_content(url, category)
            datas = '%s%s%s' % (datas, '\n', data)  # collect only successful fetches
        except Exception as e:
            print(f"Scraping failed with error: {e}")
    spider.quit()  # close the browser before printing the combined PDF
    # Also save everything as a single combined PDF
    spider.dypdf(category, datas, category)
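A final note on the "headless" claim: webdriver.Chrome() as written above opens a visible Chrome window. A minimal sketch of actually running headless, using the Selenium 3 era API that matches the code above:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')      # render pages without opening a window
options.add_argument('--disable-gpu')   # commonly paired with headless on Windows
browser = webdriver.Chrome(
    executable_path=r'C:\Users\Administrator\Desktop\chromedriver_win32\chromedriver.exe',
    options=options)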
The above code is for reference only. If it resembles anything else, this scumbag definitely copied it!