微信公众号内容采集，比较怪异，其参数，post参数需要花费时间去搞定

优采云发布时间: 2021-07-20 07:05

　　微信公众号的内容采集比较怪异：它的请求参数，尤其是 post 参数，需要花时间才能弄清楚。这里采集的是 topic 标签下的内容，并用 pdfkit 把内容打印成 PDF。

　　这里实现了两个版本。第一个版本直接通过网络请求访问页面；它其实还有一个真实的 post 地址，参数更多，我没有尝试。这种方式得到的内容只是一部分，并不理想。第二个版本使用 selenium 驱动浏览器直接访问页面，获取网页源代码后再解析，取出想要的内容。

　　这个人渣现在比较懒，代码都是以前用的，现成的，复制的，修改的，直接用！

　　版本一：

　　#微信公众号内容获取打印pdf

#by 微信：huguo00289

#https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14

# -*- coding: UTF-8 -*-

import requests

from fake_useragent import UserAgent

import os,re

import pdfkit

# Tell pdfkit which wkhtmltopdf executable to use; the Windows path below is
# specific to the author's machine and must be adjusted per installation.
confg = pdfkit.configuration(

wkhtmltopdf=r'D:\wkhtmltox-0.12.5-1.mxe-cross-win64\wkhtmltox\bin\wkhtmltopdf.exe')

class Du:
    """Scrape a WeChat official-account topic page with ``requests``.

    The topic page is downloaded, the article links embedded in its
    ``var data={...}`` script are collected, and each article is fetched
    and printed to a PDF file through ``pdfkit``.
    """

    # NOTE(review): as written, these two patterns have no literal text
    # around the capture group, so the first match is only the first
    # character of the searched text. They look like they lost their
    # delimiting HTML tags when the listing was published; restore the real
    # markers here before relying on the output.
    TITLE_PATTERN = r'(.+?)'
    DETAIL_PATTERN = r'(.+?)'

    # Characters replaced with "_" so a title can be used as a file name.
    # "<", ">" and line breaks/tabs are included because they are invalid in
    # Windows file names and can appear in text captured with re.S.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\/\\:\*\?\"\|<>\r\n\t]')

    def __init__(self, furl, timeout=8):
        """Store the topic page URL and pick a random User-Agent header.

        Args:
            furl: URL of the topic (homepage) page listing the articles.
            timeout: Seconds to wait for each HTTP request.
        """
        ua = UserAgent()
        self.headers = {
            "User-Agent": ua.random,
        }
        self.url = furl
        self.timeout = timeout

    @classmethod
    def _safe_filename(cls, title):
        """Return *title* stripped and with file-name-hostile characters replaced."""
        cleaned = cls._UNSAFE_FILENAME_CHARS.sub("_", title.strip())
        # An empty name would produce a file called just ".pdf".
        return cleaned or "untitled"

    @staticmethod
    def _first_match(pattern, text, what):
        """Return the first capture of *pattern* in *text*.

        Raises:
            ValueError: If the pattern does not match, naming *what* was
                being looked for instead of failing with a bare IndexError.
        """
        found = re.findall(pattern, text, re.S)
        if not found:
            raise ValueError(f"could not locate {what} in the page source")
        return found[0]

    def get_urls(self):
        """Download the topic page and return the set of article links.

        Returns:
            A set of link strings taken from the ``"link"`` entries of the
            page's ``var data={...}`` block.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            ValueError: If the ``var data={...}`` block is not present.
        """
        response = requests.get(self.url, headers=self.headers, timeout=self.timeout)
        response.raise_for_status()
        html = response.content.decode('utf-8')
        req = self._first_match(r'var data={(.+?)if', html, "the article list data")
        urls = set(re.findall(r',"link":"(.+?)",', req, re.S))
        print(len(urls))
        return urls

    def get_content(self, url, category):
        """Fetch one article, print it to ``<category>/<title>.pdf`` and return it.

        Args:
            url: Article URL.
            category: Directory the PDF is written into.

        Returns:
            The string ``"<title>\\n<detail>"`` that was sent to the printer.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            ValueError: If an expected marker is missing from the page.
        """
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        print(response.status_code)
        # Do not try to parse an error page as if it were an article.
        response.raise_for_status()
        html = response.content.decode('utf-8')
        req = self._first_match(
            r'(.+?)var first_sceen__time', html, "the article body"
        )

        # Title, made safe for use as a file name.
        h1 = self._safe_filename(self._first_match(self.TITLE_PATTERN, req, "the title"))
        print(h1)

        # Article detail.
        detail = self._first_match(self.DETAIL_PATTERN, req, "the article detail")

        data = f'{h1}\n{detail}'
        self.dypdf(h1, data, category)
        return data

    def dypdf(self, h1, data, category):
        """Print *data* to ``<category>/<h1>.pdf`` using the module-level pdfkit config.

        Args:
            h1: File name (without extension) of the PDF.
            data: Content handed to ``pdfkit.from_string``.
            category: Directory the PDF is written into.
        """
        datas = f'{data}'
        print("开始打印内容！")
        pdfkit.from_string(datas, f'{category}/{h1}.pdf', configuration=confg)
        print("打印保存成功！")

if __name__ == '__main__':
    # Entry point: crawl every article linked from the topic page, print each
    # one to its own PDF, then print all collected articles into one PDF.
    furl = "https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14"
    category = "潘通色卡（电子版）"

    os.makedirs(f'{category}/', exist_ok=True)
    spider = Du(furl)
    urls = spider.get_urls()

    pages = []
    for url in urls:
        print(f">> 正在爬取链接：{url} ..")
        try:
            data = spider.get_content(url, category)
        except Exception as e:
            # Best-effort crawl: report the failure and move on. Skipping here
            # matters because `data` would otherwise be undefined (first
            # failure) or still hold the previous article (later failures).
            print(f"爬取错误，错误代码为:{e}")
            continue
        pages.append(data)

    # Same layout as before: every article is preceded by a newline.
    datas = "".join(f"\n{page}" for page in pages)
    if datas:
        # Only print the combined PDF when at least one article was fetched.
        spider.dypdf(category, datas, category)

　　版本二：

　　#微信公众号内容获取打印pdf

#by 微信：huguo00289

#https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14

# -*- coding: UTF-8 -*-

import requests

from selenium import webdriver

import os,re,time

import pdfkit

from bs4 import BeautifulSoup

# Tell pdfkit which wkhtmltopdf executable to use; the Windows path below is
# specific to the author's machine and must be adjusted per installation.
confg = pdfkit.configuration(

wkhtmltopdf=r'D:\wkhtmltox-0.12.5-1.mxe-cross-win64\wkhtmltox\bin\wkhtmltopdf.exe')

class wx:
    """Scrape a WeChat official-account topic page through a Selenium-driven Chrome.

    The topic page is opened in the browser, the article links are read from
    the rendered DOM, and each article is loaded, parsed with BeautifulSoup
    and printed to a PDF file through ``pdfkit``.
    """

    # NOTE(review): as written, this pattern has no literal text around the
    # capture group, so the first match is only the first character of the
    # page source. It looks like it lost its delimiting HTML tags when the
    # listing was published; restore the real markers before relying on it.
    TITLE_PATTERN = r'(.+?)'

    # Characters replaced with "_" so a title can be used as a file name.
    # "<", ">" and line breaks/tabs are included because they are invalid in
    # Windows file names and can appear in text captured with re.S.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\/\\:\*\?\"\|<>\r\n\t]')

    def __init__(self, furl):
        """Store the topic page URL and start a Chrome browser session.

        Args:
            furl: URL of the topic (homepage) page listing the articles.
        """
        self.url = furl
        # Location of the chromedriver executable.
        self.chrome_driver = r'C:\Users\Administrator\Desktop\chromedriver_win32\chromedriver.exe'
        self.browser = self._start_browser(self.chrome_driver)

    @staticmethod
    def _start_browser(driver_path):
        """Start Chrome with the given chromedriver on old and new Selenium releases."""
        from selenium.webdriver.chrome.service import Service

        try:
            # Selenium 4 style; `executable_path` was removed in 4.10.
            return webdriver.Chrome(service=Service(driver_path))
        except TypeError:
            # Older Selenium does not know the `service` keyword.
            return webdriver.Chrome(executable_path=driver_path)

    @classmethod
    def _safe_filename(cls, title):
        """Return *title* stripped and with file-name-hostile characters replaced."""
        cleaned = cls._UNSAFE_FILENAME_CHARS.sub("_", title.strip())
        # An empty name would produce a file called just ".pdf".
        return cleaned or "untitled"

    def get_urls(self):
        """Open the topic page and return the article links found in it.

        Returns:
            A list with the ``href`` of every ``a.list_item.js_post`` element
            directly under ``div.article_list``.
        """
        from selenium.webdriver.common.by import By

        self.browser.get(self.url)
        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which
        # newer Selenium releases no longer provide.
        hrefs = self.browser.find_elements(
            By.XPATH,
            "//div[@class='article_list']/a[@class='list_item js_post']",
        )
        urls = [href.get_attribute('href') for href in hrefs]
        print(len(urls))
        return urls

    def get_content(self, url, category):
        """Load one article, print it to ``<category>/<title>.pdf`` and return it.

        Args:
            url: Article URL.
            category: Directory the PDF is written into.

        Returns:
            The string ``"<title>\\n<detail>"`` that was sent to the printer.

        Raises:
            ValueError: If the title or the ``rich_media_content`` container
                cannot be found in the page source.
        """
        self.browser.get(url)
        # Fixed pause before reading the page source.
        time.sleep(5)
        pageSource = self.browser.page_source
        soup = BeautifulSoup(pageSource, 'lxml')

        # Title, made safe for use as a file name.
        titles = re.findall(self.TITLE_PATTERN, pageSource, re.S)
        if not titles:
            raise ValueError("could not locate the title in the page source")
        h1 = self._safe_filename(titles[0])
        print(h1)

        # Article detail.
        detail = soup.find('div', class_="rich_media_content")
        if detail is None:
            # Without this check the literal text "None" would be printed.
            raise ValueError("could not locate the rich_media_content container")
        detail = str(detail)
        # Drop the account's "follow us" banner from the printed content.
        del_text = """↑ 点击上方“染整百科”关注我们</strong>"""
        detail = detail.replace(del_text, '')

        data = f'{h1}\n{detail}'
        self.dypdf(h1, data, category)
        return data

    def dypdf(self, h1, data, category):
        """Print *data* to ``<category>/<h1>.pdf`` using the module-level pdfkit config.

        Args:
            h1: File name (without extension) of the PDF.
            data: Content handed to ``pdfkit.from_string``.
            category: Directory the PDF is written into.
        """
        datas = f'{data}'
        print("开始打印内容！")
        pdfkit.from_string(datas, f'{category}/{h1}.pdf', configuration=confg)
        print("打印保存成功！")

    def quit(self):
        """Close the browser session."""
        self.browser.quit()

if __name__ == '__main__':
    # Entry point: crawl every article linked from the topic page in a real
    # browser, print each one to its own PDF, then print all collected
    # articles into one PDF.
    furl = "https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14"
    category = "潘通色卡（电子版）"

    os.makedirs(f'{category}/', exist_ok=True)
    spider = wx(furl)

    pages = []
    try:
        urls = spider.get_urls()
        for url in urls:
            print(f">> 正在爬取链接：{url} ..")
            try:
                data = spider.get_content(url, category)
            except Exception as e:
                # Best-effort crawl: report the failure and move on. Skipping
                # here matters because `data` would otherwise be undefined
                # (first failure) or still hold the previous article.
                print(f"爬取错误，错误代码为:{e}")
                continue
            pages.append(data)
    finally:
        # Always close the browser, even if the crawl aborts.
        spider.quit()

    # Same layout as before: every article is preceded by a newline.
    datas = "".join(f"\n{page}" for page in pages)
    if datas:
        # Only print the combined PDF when at least one article was fetched.
        spider.dypdf(category, datas, category)

</p>

　　以上代码仅供参考，如有雷同，必被此人渣抄袭！

0

2021-07-20

文章采集调用

0 个评论

要回复文章请先登录或注册

AI时代内容工厂

微信公众号内容采集，比较怪异，其参数，post参数需要花费时间去搞定

0 个评论

发起人

AI时代内容工厂

微信公众号内容采集，比较怪异，其参数，post参数需要花费时间去搞定

0 个评论

发起人

相关问题