Collect the top ten second-level domains for each keyword

优采云  Published: 2020-08-08 12:15

  Goal

  For each keyword in a keyword file (one keyword per line), find the top ten second-level domains that Baidu returns for it:

  That way, if the input is a set of industry keywords, you can quickly identify your SEO competitors; it also gives you a rough way to keep an eye on them.
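  Once the domains have been written out, a simple way to rank competitors is to count how often each domain appears across all keywords. A minimal sketch, assuming the output file produced by Code 1 below (one domain per line):

from collections import Counter

# Count how many times each collected second-level domain appears.
with open(r'C:\Users\Administrator\Desktop\jzdsbs4.txt', 'r', encoding='utf-8') as f:
    domains = [line.strip() for line in f if line.strip()]

# Print the 20 most frequent domains and their hit counts.
for domain, hits in Counter(domains).most_common(20):
    print(domain, hits)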

  Code 1

  Collect the display URLs that appear directly in the search results.

  

  [Image: 采集1.png]
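  The core of Code 1 is turning the display-URL text of each .c-showurl element into a bare host. A rough worked example with an assumed input string (Baidu truncates long display URLs with '...'):

# Assumed example of what a .c-showurl element's text might look like.
text = 'www.example.com/news/2020...'
host = text.split('/')[0].split('-')[0].replace('\n', '').replace(' ', '').replace('...', '')
print(host)  # -> www.example.com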

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import time
import requests  # used to fetch web pages; needs to be installed separately
import re
from bs4 import BeautifulSoup

kws = open(r'C:\Users\Administrator\Desktop\keywords.txt', 'r')  # keyword file, one keyword per line, no blank lines; change the path as needed
morekwsall = open(r'C:\Users\Administrator\Desktop\jzdsbs4.txt', 'w+', encoding='utf-8')  # output file, mind the encoding

headers1 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip,deflate',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
    'Referer': 'http://www.hao123.com/',
    'Cache-Control': 'max-age=0',
    'Cookie': 'BAIDUID=8A69D49EC2D3B576D17277579AEDA83F:FG=1; BIDUPSID=8A69D49EC2D3B576D17277579AEDA83F; PSTM=1463384511; ' \
              'BDUSS=2ZESWpnckRaeHRkVjdqN3dJa0RINWNrSDJTbm1sRFVlRUMzelNCVTBwSEliY0ZYQVFBQUFBJCQAAAAAAAAAAAEAAAAVcMhIYWFzZDEzM' \
              'jM0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMjgmVfI4JlXY; ' \
              'uc_login_unique=f62bbe2bfc4a840aa437e67dcde239a3; BD_HOME=1; BDRCVFR[S4-dAuiWMmn]=I67x6TjHwwYf0; B64_BOT=1; ' \
              'BD_UPN=12314353; sug=3; sugstore=1; ORIGIN=2; bdime=0; ' \
              'BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BD_CK_SAM=1; H_PS_PSSID=1428_18280_21116_20592_21189_21160_20929; BDSVRTM=0'
}

kwsline = kws.readlines()
o = 0  # error counter
for kw in kwsline:
    utzfline = kw.strip()
    time.sleep(2)  # don't hammer the server
    try:
        urlbaidu1 = 'http://www.baidu.com/baidu?wd=' + utzfline + '&tn=monline_4_dg'
        rbaidu1 = requests.get(urlbaidu1, headers=headers1, allow_redirects=False)
        print(rbaidu1.status_code)
        soup = BeautifulSoup(rbaidu1.content, 'lxml')
        result_urls = soup.select('.c-showurl')  # the display-URL element of each result
        for result_url in result_urls:
            final_result = result_url.get_text().split('/')[0].split('-')[0] \
                .replace('\n', '').replace(' ', '').replace('...', '')
            print(final_result)
            morekwsall.write(final_result + '\n')
    except Exception as e:
        o = o + 1
        print(e)
        time.sleep(100)
        kwsline.append(kw)  # put the failed keyword back at the end of the list and retry it later
    finally:
        print(o)

kws.close()
morekwsall.close()
print(u'Done!!!')

  Note:

  If a request fails, wait 100 seconds, then append the failed keyword back onto the list so collection continues. Because Python's for loop also visits items appended during iteration, the keyword is retried at the end of the run. For example, if keyword 50 errors, the script sleeps 100 seconds and keyword 50 is collected again later:

    except Exception as e:
        o = o + 1
        print(e)
        time.sleep(100)
        kwsline.append(kw)
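  One caveat: a keyword that always fails will be re-appended forever. A hedged variant of the except block that caps the number of retries (retries and max_retries are assumed names, e.g. retries = collections.defaultdict(int) and max_retries = 3 defined before the loop):

    except Exception as e:
        print(e)
        retries[kw] += 1                  # retries = collections.defaultdict(int), defined before the loop
        if retries[kw] < max_retries:     # max_retries is an assumed cap, e.g. 3
            time.sleep(100)
            kwsline.append(kw)            # retry this keyword at the end of the run
        else:
            print('giving up on', kw.strip())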

  Code 2:

  Resolve the real landing URL behind Baidu's redirect links in the search results.

  

  [Image: 采集3.png]

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import time
import requests  # used to fetch web pages; needs to be installed separately
import re
from bs4 import BeautifulSoup
from urllib import request

kws = open(r'C:\Users\Administrator\Desktop\keywords.txt', 'r')  # keyword file, one keyword per line, no blank lines; change the path as needed
morekwsall = open(r'C:\Users\Administrator\Desktop\jzds.txt', 'w+', encoding='utf-8')  # output file, mind the encoding
morekwserror = open(r'C:\Users\Administrator\Desktop\jzds_error.txt', 'w+', encoding='utf-8')  # failed addresses go here

headers1 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip,deflate',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
    'Referer': 'http://www.hao123.com/',
    'Cache-Control': 'max-age=0',
    'Cookie': 'BAIDUID=8A69D49EC2D3B576D17277579AEDA83F:FG=1; BIDUPSID=8A69D49EC2D3B576D17277579AEDA83F; PSTM=1463384511; ' \
              'BDUSS=2ZESWpnckRaeHRkVjdqN3dJa0RINWNrSDJTbm1sRFVlRUMzelNCVTBwSEliY0ZYQVFBQUFBJCQAAAAAAAAAAAEAAAAVcMhIYWFzZDEzM' \
              'jM0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMjgmVfI4JlXY; ' \
              'uc_login_unique=f62bbe2bfc4a840aa437e67dcde239a3; BD_HOME=1; BDRCVFR[S4-dAuiWMmn]=I67x6TjHwwYf0; B64_BOT=1; ' \
              'BD_UPN=12314353; sug=3; sugstore=1; ORIGIN=2; bdime=0; ' \
              'BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BD_CK_SAM=1; H_PS_PSSID=1428_18280_21116_20592_21189_21160_20929; BDSVRTM=0'
}

kwsline = kws.readlines()
for kw in kwsline:
    utzfline = kw.strip()
    urlbaidu1 = 'http://www.baidu.com/baidu?wd=' + utzfline + '&tn=monline_4_dg'
    try:
        rbaidu1 = requests.get(urlbaidu1, headers=headers1)
        soup = BeautifulSoup(rbaidu1.content, 'lxml')
        result_urls = soup.select('div > h3 > a')  # result links, which point at Baidu's redirect URLs
        for result_url in result_urls:
            # time.sleep(0.5)  # don't hammer the server
            try:
                with request.urlopen(result_url.get('href')) as f:
                    real_url = f.geturl()  # this can fail (e.g. 403/404), and then the real address is lost
            except Exception as e:
                print("@@@")
                print(result_url)
                morekwserror.write(str(e) + ':' + str(result_url) + '\n')
                print(e)
                real_url = '///'  # placeholder so the split below still works
                print(real_url.split('/')[2])
            finally:
                real_url2 = real_url.split('/')[2]  # keep only the host part of the resolved URL
                print(real_url2)
                morekwsall.write(real_url2 + '\n')
    except Exception as e:
        print(e)
        print("!!!!")
        time.sleep(50)
        kwsline.append(kw)  # put the failed keyword back at the end of the list and retry it later

kws.close()
morekwsall.close()
morekwserror.close()
print(u'Done!!!')

  A remaining issue: when the final landing page is a 403 or 404, request.urlopen and f.geturl() raise an error. I wanted to at least record the failing address, but that attempt failed too, so for now it is skipped. In my run, the error came up on the second try at keyword 50 (collection then continued from keyword 51).
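  One possible workaround, not used in the code above: Baidu's redirect links usually answer with an HTTP 302 whose Location header carries the real landing URL, so the target address can be read without ever downloading the destination page, and a 403/404 on that page no longer matters. A minimal sketch under that assumption (resolve_real_url is a hypothetical helper name):

import requests

def resolve_real_url(baidu_link, headers):
    # Fetch only the Baidu redirect itself and do not follow it;
    # the Location header (when present) is the real landing URL.
    r = requests.get(baidu_link, headers=headers, allow_redirects=False, timeout=10)
    return r.headers.get('Location', '///')  # fall back to the same '///' placeholder as above

# Usage inside the inner loop of Code 2, instead of request.urlopen:
# real_url = resolve_real_url(result_url.get('href'), headers1)
# real_url2 = real_url.split('/')[2]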
