抓取网页音频(如何在手机App中的数据中抓取数据?(图) )
优采云 发布时间: 2021-11-05 03:05抓取网页音频(如何在手机App中的数据中抓取数据?(图)
)
过去,我们抓取的大多是网页上的数据,很少抓取手机 App 中的数据。那么,如何抓取手机 App 中的数据呢?通常我们使用抓包工具来抓取数据。
常用的抓包工具有 Fiddler 和 Charles 等。今天主要讲一下 Charles 的使用。与 Fiddler 相比,Charles 更强大,也更易于使用,所以我一般推荐使用 Charles 抓包。
下载并安装 Charles
下载安装 Charles,然后破解 Charles;网上有相关的教程文章,这里就不多说了。
json
注意事项:应用
如果抓取到的数据是乱码,需要在 Charles 的菜单栏中设置 SSL 代理:Proxy ==> SSL 代理设置(SSL Proxying Settings)==> 添加 443 端口,如上图所示。另外,在实际用代码抓取数据的时候,记得把这个代理关掉,以免丢失数据。
使用 Charles
这里我直接放两张图,大家看看就明白了。
我们一起来分析一下这个项目。
# 这里有点递归的意味<br style="margin:0px;padding:0px;max-width:100%;"> max_id = datas[-1]['publish_time_stamp']<br style="margin:0px;padding:0px;max-width:100%;"> if self.max_id != max_id:<br style="margin:0px;padding:0px;max-width:100%;"> self.max_id = max_id<br style="margin:0px;padding:0px;max-width:100%;"> self.request_data()<br style="margin:0px;padding:0px;max-width:100%;"> else:<br style="margin:0px;padding:0px;max-width:100%;"> print('数据抓取完毕!')<br style="margin:0px;padding:0px;max-width:100%;">
完整代码:
import requests

import time
import json
from dedao.ExeclUtils import ExeclUtils
import os


class dedao(object):
    """Crawl the audio list endpoint and archive every entry.

    For each entry returned by ``base_url`` the crawler downloads the mp3
    file into ``mp3_dir`` and appends one row (source, title, icon, share
    title, mp3 URL, duration, size) to a spreadsheet through ``ExeclUtils``.

    Paging is driven by ``max_id``: each request sends the current value and
    the ``publish_time_stamp`` of the last entry of the page becomes the next
    ``max_id``.  The crawl stops when that value no longer changes or when a
    page comes back empty.
    """

    # Directory the downloaded mp3 files are written to (created on demand).
    mp3_dir = u'D:/store/mp3'
    # Seconds to wait on a single HTTP call before giving up, so that a
    # stalled connection cannot hang the crawl forever.
    request_timeout = 30

    def __init__(self):
        """Create the spreadsheet and initialise paging state and headers."""
        self.rows_title = [u'来源目录', u'标题', u'图片', u'分享标题', u'mp3地址', u'音频时长', u'文件大小']
        sheet_name = u'逻辑思惟音频'

        return_execl = ExeclUtils.create_execl(sheet_name, self.rows_title)
        self.execl_f = return_execl[0]
        self.sheet_table = return_execl[1]
        # Kept for backward compatibility; rows are now built in a local list.
        self.audio_info = []
        # Number of rows written so far; the first data row is row 1.
        self.count = 0
        self.base_url = 'https://entree.igetget.com/acropolis/v1/audio/listall'
        self.max_id = 0
        # NOTE(review): these values look like they were copied from one
        # captured app request (fixed timestamp, signature, cookie) -- confirm
        # they are still accepted by the server.
        # Header names must not carry trailing whitespace: a name such as
        # 'Cookie ' is sent as "Cookie : ..." which is a malformed header.
        # NOTE(review): 'br' is advertised in Accept-Encoding; confirm the
        # HTTP stack in use can decode brotli responses.
        # Content-Length is intentionally not set; requests computes it.
        self.headers = {
            'Host': 'entree.igetget.com',
            'X-OS': 'iOS',
            'X-NET': 'wifi',
            'Accept': '*/*',
            'X-Nonce': '779b79d1d51d43fa',
            'Accept-Encoding': 'br, gzip, deflate',
            'X-TARGET': 'main',
            'User-Agent': '%E5%BE%97%E5%88%B0/4.0.13 CFNetwork/901.1 Darwin/17.6.0',
            'X-CHIL': 'appstore',
            'Cookie': 'acw_tc=AQAAAC0YfiuHegUAxkvoZRLraUMQyRfH; aliyungf_tc=AQAAAKwCD1dINAUAxkvoZTppW+jezS/9',
            'X-UID': '34556154',
            'X-AV': '4.0.0',
            'X-SEID': '',
            'X-SCR': '1242*2208',
            'X-DT': 'phone',
            'X-S': '91a46b7a31ffc7a2',
            'X-Sign': 'ZTBiZjQyNTI1OTU2MTgwZjYwMWRhMjc5ZjhmMGRlNGI=',
            'Accept-Language': 'zh-cn',
            'X-D': 'ca3c83fca6e84a2d869f95829964ebb8',
            'X-THUMB': 'l',
            'X-T': 'json',
            'X-Timestamp': '1528195376',
            'X-TS': '1528195376',
            'X-U': '34556154',
            'X-App-Key': 'ios-4.0.0',
            'X-OV': '11.4',
            'Connection': 'keep-alive',
            'X-ADV': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'X-V': '2',
            'X-IS_JAILBREAK': 'NO',
            'X-DV': 'iPhone10,2',
        }

    def request_data(self):
        """Fetch and process pages, starting at ``self.max_id``, until done.

        Pages are fetched in a loop rather than by mutual recursion with
        ``parse_data`` so that a long crawl cannot exhaust the interpreter's
        recursion limit.  Any exception is printed and ends the crawl
        (best-effort behaviour, as before); nothing is raised to the caller.
        """
        try:
            while True:
                data = {
                    'max_id': self.max_id,
                    'since_id': 0,
                    'column_id': 2,
                    'count': 20,
                    'order': 1,
                    'section': 0
                }
                response = requests.post(self.base_url, headers=self.headers, data=data,
                                         timeout=self.request_timeout)
                if 200 != response.status_code:
                    # Report the failure instead of stopping without a trace.
                    print('请求失败, HTTP状态码: {}'.format(response.status_code))
                    break
                if not self._parse_page(response):
                    break
        except Exception as e:
            print(e)
            # Pause retained from the original error path.
            time.sleep(2)

    def parse_data(self, response):
        """Process one page and, if more data is expected, keep crawling.

        :param response: response object of a successful list request; its
            ``text`` must be the JSON document returned by the endpoint.
        """
        if self._parse_page(response):
            self.request_data()

    def _parse_page(self, response):
        """Store every entry of one page and advance ``self.max_id``.

        :param response: response object whose ``text`` is the JSON payload.
        :return: ``True`` when ``max_id`` advanced and another page should be
            requested, ``False`` when the crawl is complete.
        """
        dict_json = json.loads(response.text)
        datas = dict_json['c']['list']  # list of entries on this page
        if not datas:
            # An empty page means there is nothing left; without this guard
            # ``datas[-1]`` below would raise IndexError.
            print('数据抓取完毕!')
            return False

        for data in datas:
            detail = data['audio_detail']
            mp3_url = detail['mp3_play_url']
            duction = str(detail['duration']) + '秒'
            size = '%.2fM' % (detail['size'] / (1000 * 1000))

            self.download_mp3(mp3_url)

            row = [
                detail['source_name'],
                detail['title'],
                detail['icon'],
                detail['share_title'],
                mp3_url,
                duction,
                size,
            ]
            self.count = self.count + 1
            ExeclUtils.write_execl(self.execl_f, self.sheet_table, self.count,
                                   row, u'逻辑思惟音频.xlsx')
            print('采集了{}条数据'.format(self.count))

        time.sleep(3)  # throttle: do not hit the server too quickly
        max_id = datas[-1]['publish_time_stamp']
        if self.max_id != max_id:
            self.max_id = max_id
            return True
        print('数据抓取完毕!')
        return False

    def download_mp3(self, mp3_url):
        """Download ``mp3_url`` into ``mp3_dir`` unless the file already exists.

        The file name is the last path segment of the URL.  The payload is
        fetched first and written to a ``.part`` file that is renamed only on
        success, so a failed download never leaves an empty or truncated file
        that later runs would mistake for a finished one.  Errors are printed
        and swallowed so one bad file does not stop the crawl.

        :param mp3_url: absolute URL of the mp3 file.
        """
        try:
            mp3_path = u'{}/{}'.format(self.mp3_dir, mp3_url.split('/')[-1])
            print(mp3_path)
            if os.path.exists(mp3_path):
                return
            os.makedirs(self.mp3_dir, exist_ok=True)
            resp = requests.get(mp3_url, timeout=self.request_timeout)
            # Do not save an HTTP error page as if it were audio.
            resp.raise_for_status()
            tmp_path = mp3_path + u'.part'
            # Audio is binary data, so the file is opened in binary mode.
            with open(tmp_path, 'wb') as f:
                f.write(resp.content)
            os.replace(tmp_path, mp3_path)
        except Exception as e:
            print(e)


if __name__ == '__main__':
    d = dedao()
    d.request_data()