前程无忧网站内容(爬取前程无忧招聘信息本文,代码和数据仅供参考,学习 )
优采云 发布时间: 2021-10-20 16:21 前程无忧网站内容(爬取前程无忧招聘信息本文,代码和数据仅供参考,学习)
爬取51job招聘信息
本文为招聘数据爬取,所选网站为51job。
百度直接搜索51job或前程无忧。我们会看到搜索栏,在搜索栏中输入“数据分析师”可以看到职位信息。
至于分析网站,这里就不解释了。这个爬虫只是爬取一点点数据,所以不会做伪装的爬虫机制。所以本文仅供参考。如果你真的想爬这个网站,请联系博主,我会写一篇详细的文章,以下是代码和数据,仅供参考。
# !/usr/bin/python
# -*- coding: utf-8 -*-
'''
@File : qianchengwu_crab.py
@Time : 2020/03/15 21:21:18
@Author : Qingxiang Zhang
@Version : 1.0
@Contact : 344285081@qq.com
@Desc :
@Software: Vscode
'''
import csv
import json
import re
import urllib
import urllib.error
import urllib.request

import requests
def main():
    """Crawl up to 59 pages of 51job search results for the keyword
    "数据分析师" (data analyst) nationwide and append one CSV row per posting
    (job name, company, salary text, welfare text) to ./result.csv.
    """
    # Open the CSV once for the whole run; the original re-opened the file
    # for every single row, which is needlessly slow.
    with open("./result.csv", "a", encoding="utf-8", newline="") as f:
        csv_write = csv.writer(f)
        for i in range(1, 60):
            print('正在爬取第{}页信息'.format(i))
            # URL hard-codes the URL-encoded keyword and filters; {} is the page number.
            baseurl = "https://search.51job.com/list/000000,000000,0130%252c7501%252c7506%252c7502,01%252c32%252c38,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,{}.html".format(i)#全国+keyword
            html = askURL(baseurl)
            # BUG FIX: the original pattern r'window.__SEARCH_RESULT__ =(.*?)'
            # ends in a lazy group with no terminator, so it always captured
            # an empty string and json.loads('') raised. Anchor the capture to
            # the closing </script> tag, escape the dots, and allow the JSON
            # to span multiple lines (re.S).
            re_soup = re.search(r'window\.__SEARCH_RESULT__\s*=\s*(.*?)</script>',
                                html, re.S)
            if re_soup is None:
                # Blocked request or changed page layout: skip instead of
                # crashing with AttributeError on .group(1).
                print('第{}页未匹配到搜索结果,跳过'.format(i))
                continue
            json_data = json.loads(re_soup.group(1))
            for items in json_data["engine_search_result"]:
                job_name = items["job_name"]
                company_name = items["company_name"]
                jobwelf = items["jobwelf"]
                providesalary_text = items["providesalary_text"]
                csv_write.writerow([job_name, company_name,
                                    providesalary_text, jobwelf])
def askURL(url):
    """Fetch *url* with a desktop-browser User-Agent header and return the
    response body decoded as GBK (51job serves GBK-encoded HTML — TODO
    confirm against live site). Returns "" if the request fails.
    """
    head = {
        # Plain-browser UA so the request is not rejected as a bot.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # FIX: use a context manager so the HTTP response is always closed;
        # the original leaked the connection on every call.
        with urllib.request.urlopen(request) as response:
            # 'ignore' drops undecodable bytes rather than raising.
            html = response.read().decode('gbk', 'ignore')
    except urllib.error.URLError as e:
        # HTTPError subclasses URLError, so both cases land here; HTTPError
        # carries .code, plain URLError carries .reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# Entry point: run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()
数据样式: