今日头条文章采集软件(目录私信小编01即可获取大量Python学习资料需要的Python模块实现思路)

优采云 发布时间: 2022-04-03 04:10

  今日头条文章采集软件(目录私信小编01即可获取大量Python学习资料需要的Python模块实现思路)

  最近在今日头条文章的数据抓取过程中,发现视频地址的获取比较复杂。对应的解决思路是在源码和浏览器的配合下找到的,所以记录一下。

  

  内容

  私信小编01可以获得大量Python学习资料

  所需Python模块、实现思路、代码及运行结果

  1.必需的 Python 模块

   主要模块有 requests(或者 aiohttp)和 PyExecJS。前者用于请求文章的源码,后者是在 Python 中执行 JS 代码的依赖库,主要用于生成视频地址。

  2.实现思路 一. 主要需求是将原文章中的视频和图片地址替换为本地存储地址,因此需要先下载这些资源。对于视频,先尝试通过抓包分析寻找对应的视频地址,但在文章源码和相关接口的响应中都没有找到对应的视频地址参数。

  将文章源码(HTML)交给浏览器渲染后,发现视频标签是后来才生成的,并且其中带有视频地址,因此这个标签一定是由 JS 生成的。接下来通过搜索关键字,定位到生成该标签的 JS 脚本。

  二. 分析地址对应的js,发现有生成视频标签的方法,推断有生成视频地址的方法,如下:

  到这里就很清楚我们要的视频地址是从哪里来的了。这是方法:

  分析该方法,发现有一个关键参数 t。另外,在图2中可以找到方法 e 以及传入的参数 v,这让我想起之前抓包时某个接口返回结果中的 main_url:var u = o.data.video_list, h = u.video_1, v = h.main_url。 三. 接口为:

  接口返回结果:

  同时该接口中的参数(v0201f800000bub4vq2vtt9a5oknnlp0)可以在源码中找到,可以使用正则模式进行匹配。

  可以大胆尝试,把 main_url 的值传入生成视频地址的方法。为此需要在 JS 底部添加参数:var c = new Array(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1);

  我用的是JS调试工具(调试方便,检查代码语法),其他方法也可以

  结果是:

  %3D%3D&vl=&vr=

  得到的地址确实是视频地址,证明上面的猜想是正确的。但地址中的参数具有时效性,因此需要每次动态生成。读者可以自行测试,重新生成地址。

  代码和运行结果(我用的是另一种方式)

<p>async def get_page_source(url): browser = None page = None try: browser = await launch( headless=True, ignoreHTTPSErrors=True, handleSIGINT=False, handleSIGTERM=False, handleSIGHUP=False, defaultViewport=None, args=[&#39;--disable-setuid-sandbox&#39;, &#39;--no-sandbox&#39;, &#39;--ignore-certificate-errors&#39;, &#39;--disable-gpu&#39;, &#39;--disable-gpu-sandbox&#39;, &#39;--start-maximized&#39; ] ) pages = await browser.pages() page = pages[0] # 是否启用JS,enabled设为False,则无渲染效果 await page.setJavaScriptEnabled(enabled=True) await page.setViewport(viewport={&#39;width&#39;: 1200, &#39;height&#39;: 800}) await page.evaluateOnNewDocument( &#39;() =>{ Object.defineProperties(navigator,{ webdriver:{ get: () => false } }) }&#39;) await page.evaluateOnNewDocument("() =>{ Object.defineProperty(navigator, &#39;plugins&#39;, { get: () => [] }) }") await page.evaluateOnNewDocument( "() =>{ Object.defineProperty(navigator, &#39;languages&#39;, { get: () => [&#39;zh-CN&#39;,&#39;zh] }) }") await page.setUserAgent( &#39;Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36&#39;) await page.goto(url, {&#39;timeout&#39;: 5000, &#39;waitUntil&#39;: &#39;load&#39;}) page_source = await page.content() return page_source except Exception as e: # app_logger.error(&#39;账号:%s, 登录错误:%s&#39; % (username, e)) print(e) return -1 finally: if page is not None: # await page.waitFor(1000) await page.close() if browser is not None: await browser.close()async def get_data(url, continue_number=0): """解析文章源码,提取视频,文字,图片等信息""" try: page_source = await get_page_source(url) # 视频处理,及视频封面 video_message_id_ = re.findall(&#39;tt-videoid="(.*?)"&#39;, page_source) video_cover_ = re.findall(&#39;tt-poster="(.*?)"&#39;, page_source) if len(video_message_id_) > 0 and len(video_cover_) > 0: video_message_id = video_message_id_[0] video_url = await get_video_url_id(video_message_id, url) video_cover = await download_video_cover(video_cover_[0], url) 
except Exception as e: if continue_number < continue_num: print(e) # app_logger.error(&#39;function get_data error: %s&#39; % e) continue_number += 1 video_address = await get_data(url, continue_number) return video_address else: # app_logger.error(&#39;function get_data : %s exceed maximum retry&#39; % url) return -1async def get_video_url_id(video_id, article_url, continue_number=0): """解析视频main_url""" header = {&#39;User-Agent&#39;: &#39;Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) &#39; &#39;Chrome/83.0.4103.116 Safari/537.36&#39;} data_url = &#39;https://i.snssdk.com/video/urls/1/toutiao/mp4/{}&#39;.format(video_id) try: async with aiohttp.ClientSession(connector=TCPConnector(verify_ssl=False), timeout=timeout) as session: async with session.get(data_url, headers=header) as resp: response = await resp.json() if response[&#39;message&#39;].strip() == "success": data = response[&#39;data&#39;][&#39;video_list&#39;] keys = data.keys() if &#39;video_3&#39; in keys: main_url = data[&#39;video_3&#39;][&#39;main_url&#39;] video_url = await get_video_url(main_url) video_url_oss = await download_video(video_url, article_url) return video_url_oss elif &#39;video_3&#39; not in keys and &#39;video_2&#39; in keys: main_url = data[&#39;video_3&#39;][&#39;main_url&#39;] video_url = await get_video_url(main_url) video_url_oss = await download_video(video_url, article_url) return video_url_oss else: main_url = data[&#39;video_3&#39;][&#39;main_url&#39;] video_url = await get_video_url(main_url) video_url_oss = await download_video(video_url, article_url) return video_url_oss except Exception as e: if continue_number < continue_num: print(e) # app_logger.error(&#39;function get_data error: %s&#39; % e) continue_number += 1 video_address = await get_data(url, continue_number) return video_address else: # app_logger.error(&#39;function get_data : %s exceed maximum retry&#39; % url) return -1async def get_video_url(main_url, continue_number=0): 
"""获取视频地址,js执行""" try: tt = """var c = new Array( - 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1); function e(t) { var e, o, i, r, n, a, s; for (a = t.length, n = 0, s = ""; a > n;) { do e = c[255 & t.charCodeAt(n++)]; while (a > n && -1 == e); if ( - 1 == e) break; do o = c[255 & t.charCodeAt(n++)]; while (a > n && -1 == o); if ( - 1 == o) break; s += String.fromCharCode(e > 4); do { if (i = 255 & t.charCodeAt(n++), 61 == i) return s; i = c[i] } while ( a > n && - 1 == i ); if ( - 1 == i) break; s += String.fromCharCode((15 & o) > 2); do { if (r = 255 & t.charCodeAt(n++), 61 == r) return s; r = c[r] } while ( a > n && - 1 == r ); if ( - 1 == r) break; s += String.fromCharCode((3 & i)

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线