前程无忧网站内容(前程无忧网站,职位信息一步到位函数爬取!!!!!)

优采云 发布时间: 2021-12-09 16:19

  前程无忧网站内容(前程无忧网站,职位信息一步到位函数爬取!!!!!)

  网站,招聘信息一步到位!!!真的一步到位

  又到毕业季了。那么对于将来要去的方向有什么好的计划呢?最好爬取一些职位的数据来分析分析。看了不少招聘网站,要么资料少,要么很难得到你想要的。不过,我一眼就看中了前程无忧(51job)网站。这个网站数据量大,资料齐全,不算太差。

  1、网页分析

  

  先搜索少量数据,我们要获取总页数,即圈出的东西,然后去详情页获取更多数据:

  

  即图中框出的所有数据

  2、那就废话少说,直接上代码:

  def word(word=None):

import requests

from lxml import etree

import pandas as pd

from requests.auth import HTTPBasicAuth

import re

from lxml import etree

import time

headers ={

}

headers[ 'User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'

b='adv=adsnew%3D1%26%7C%26adsresume%3D1%26%7C%26adsfrom%3Dhttps%253A%252F%252Fwww.baidu.com%252Fbaidu.php%253Fsc.Ks00000EAMrnlPLIybYNddmhQxV6IThZIe70Sr8VGpPGAfEzIiBE3N8p4LE2LEWCAHyzBdESmMZ-sGbi0sHVd6dhuzz4rfykOQ6N80T4NJgVb2RtQINK4AnOhKIekZ45BUl7eB4IbpXpwye5vkWgttCVOuz2P80--SJceSvd9MaxBTyepp5o5WQo-14LrX_R3tZ_9JO3m-wU6DsC4Er88NrA9MDF.7R_NR2Ar5Od66CHnsGtVdXNdlc2D1n2xx81IZ76Y_u2qS1x3dqyT8P9MqOOgujSOODlxdlPqKMWSxKSgqjlSzOFqtZOmzUlZlS5S8QqxZtVAOtItISiel5SKkOeZwSEHNuxSLsfxzcEqolQOY2LOqWq8WvLxVOgVlqoAzAVPBxZub88zuQCUS1FIEtR_4n-LiMGePOBGyAp7WFEePl6.U1Y10ZDqdIjA80Kspynqn0KsTv-MUWY1njNBujuhnvwBm17BnvcLmyD1mhRYnvubuW01PvDdnsKY5IgfkoBe3fKGUHYznWR0u1dEuZCk0ZNG5yF9pywd0ZKGujYzPsKWpyfqnWfY0AdY5HDsnH-xnH0kPdtznjmzg1Dknjfkg1nsnjuxn1msnfKopHYs0ZFY5HD3P6K-pyfqn104P1KxnHfdnNtznHDkndt1nj6Yn7t1njb4PNt1nj6zndt1njTkPsKBpHYkPH9xnW0Yg1ckPsKVm1Yknj0kg1D3Pjbknjm4rHNxrHczn1T3PWm3P-tYn7tknjFxn0KkTA-b5H00TyPGujYs0ZFMIA7M5H00mycqn7ts0ANzu1Ys0ZKs5H00UMus5H08nj0snj0snj00Ugws5H00uAwETjYs0ZFJ5H00uANv5gKW0AuY5H00TA6qn0KET1Ys0AFL5HDs0A4Y5H00TLCq0A71gv-bm1dsTzdMXh93XfKGuAnqiD4K0ZwdT1Ykn1f3nHbvrjTsPWD1PW6Lnjmvn6Kzug7Y5HDdrjnvPjDYrHD3nHf0Tv-b5yPBPH63ujuBnj0sPjT3Pvf0mLPV5HbvPjPDwWIjnYuarHIjnWb0mynqnfKYIgfqnfKsUWYs0Z7VIjYs0Z7VT1Ys0ZGY5H00UyPxuMFEUHYsg1Kxn7tsg1nsnH7xn0Kbmy4dmhNxTAk9Uh-bT1Ysg1Kxn7tznjRznHKxPjc1PjnvnNts0ZK9I7qhUA7M5H00uAPGujYs0ANYpyfqQHD0mgPsmvnqn0KdTA-8mvnqn0KkUymqn0KhmLNY5H00pgPWUjYs0ZGsUZN15H00mywhUA7M5H60UAuW5H00uAPWujY0mhwGujdKPjIjfH-AP1DdwjwDwRcYn1uDPRPafbFAPj0kPWPDwfKBIjYs0AqY5H00ULFsIjYsc10Wc10Wnansc108nj0snj0sc10Wc100TNqv5H08rH9xna3sn7tsQW0sg108nWIxna3sPdtsQWnY0AN3IjYs0APzm1YzPHDkn6%2526word%253D%2525E6%25258B%25259B%2525E8%252581%252598%2526ck%253D2539.10.112.366.290.220.175.400%2526shh%253Dwww.baidu.com%2526us%253D1.26594.3.0.2.929.0%2526bc%253D110101%26%7C%26adsnum%3D2198971; guid=80fce931b0d504e6510c34969654cfd2; partner=www_baidu_com; 
nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60000000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%D0%C2%C3%BD%CC%E5%C4%DA%C8%DD%D4%CB%D3%AA%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60200200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C3%BD%CC%E5%CA%FD%BE%DD%B7%D6%CE%F6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; m_search=keyword%3D%E6%96%B0%E5%AA%92%E4%BD%93%E5%86%85%E5%AE%B9%E8%BF%90%E8%90%A5%26%7C%26areacode%3D000000'

line=b.split(';')

cookie={

}

for i in line:

key,value = i.split('=',1)

cookie[key] = value###得到cookies

title=[]#标题

num_1=[]#字段1

num_2=[]#字段2

num_3=[]#字段3

num_4=[]#字段4

num_5=[]#字段5

num_6=[]#字段6

num_7=[]#字段7

num_al=[]#字段8、9、10

url='https://search.51job.com/list/000000,000000,0000,00,9,99,'+word+',2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='###首先需要得到总页码

response=requests.get(url,headers=headers)

html=etree.HTML(response.text)

pag=re.sub('\xa0|/|\n|\r|\s','',html.xpath('//*[@id="resultList"]/div[2]/div[5]/text()')[2]##处理得到的页码

for i in list(range(1,int(pag)+1)):

url='https://search.51job.com/list/000000,000000,0000,00,9,99,'+word+',2,'+str(i)+'.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='###网页的构建

response=requests.get(url,headers=headers)

response.encoding='gbk'

res=response.text

html = etree.HTML(res)

url_son=[]

for m in html.xpath('//p/span/a[@target="_blank"]/@href'):

url_son=url_son+re.findall('https://jobs.51job.com.*?\?s=01&t=0',m)###进行网址的筛选,因为有的网站是坏的,但是好的网站会有规律。

for j in range(len(url_son)):

res_son=requests.get(url_son[j],headers=headers)

res_son.encoding='gbk'

html_son = etree.HTML(res_son.text)

title.append(''.join(html_son.xpath('//div[@class="cn"]/h1/text()')))

num_1.append(''.join(html_son.xpath('//p[@class="cname"]/a/@title')))

num_2.append(''.join(html_son.xpath('//div[@class="cn"]/strong/text()')))

num_3.append(''.join(html_son.xpath('//div[@class="cn"]/p/@title')))

num_4.append(''.join(html_son.xpath('//span[@class="sp4"]/text()')))

num_5.append(''.join(html_son.xpath('//div[@class="bmsg job_msg inbox"]/p/text()')))

num_6.append(''.join(html_son.xpath('//div[@class="bmsg inbox"]/p/text()')))

num_7.append(''.join(html_son.xpath('//div[@class="tmsg inbox"]/text()')))

num_al.append(html_son.xpath('//div[@class="com_tag"]/p/@title'))

print ('第'+str(i)+'页')

time.sleep(5)

for i in range(len(num_3)):

num_3[i]=re.sub('\xa0','',num_3[i])

columns={

'标题':title,'字段1':num_1,'字段2':num_2,'字段3':num_3,'字段4':num_4,'字段5':num_5,'字段6':num_6,'字段7':num_7,'字段合':num_al}

df=pd.DataFrame(columns)

num_alnew=[]

for i in range(len(df['字段合'])):

num_alnew.append('|'.join(df['字段合'][i]))

df['新合']=num_alnew

data=df.drop(['字段合'],axis=1)

data['字段8']=data['新合'].str.split('|',expand=True)[0]

data['字段9']=data['新合'].str.split('|',expand=True)[1]

data['字段10']=data['新合'].str.split('|',expand=True)[2]

data_finall=data.drop(['新合'],axis=1)###以上皆为数据的处理

return data_finall.to_excel('C:\\Users\\dell\\Desktop\\'+word+'.xlsx',encoding='utf_8_sig')###导出数据注意修改格式

  开始爬取:

  word('网络安全专员')

  得到答案:

  

  得到的结果并不完美。

  3、总结

  在很多错误中完善自己,然后继续优化代码,这样你就会越来越好。

  注:本次采集的数据仅供学习研究使用

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线