Scraping Web Page Data with a Crawler
We want this crawler to extract some data from each web page and then do something with it. This approach is also known as scraping.
2.1 Analyzing the Page
Right-click the page and choose the View Source option to see the page's HTML source.
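If you would rather see the same source from Python, here is a minimal sketch using urllib; the URL is the sample country page of the example site used throughout this article and is only illustrative:
import urllib.request

request = urllib.request.Request(
    'http://example.webscraping.com/places/default/view/Afghanistan-1',
    headers={'User-agent': 'brain'})  # illustrative user agent, matching the crawler below
html = urllib.request.urlopen(request).read().decode('utf-8')
print(html[:500])                     # show the first 500 characters of the page source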
2.2 Three Ways to Scrape a Web Page
2.2.1 Regular Expressions
To extract the area data with a regular expression, we first need to match the content of the <td> element whose class is w2p_fw.
The implementation is as follows:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import time
import datetime
import urllib.request
import urllib.parse
import urllib.error
import urllib.robotparser
import lxml.html  # third-party: pip install lxml cssselect (cssselect is needed for cssselect())
# Download the page at url. proxy enables optional proxy support: it defaults to None,
# and passing a proxy address turns it on. num_retries is the number of retries on server errors.
def download(url, user_agent='brain', proxy=None, num_retries=2):
    print('Downloading:', url)
    # set our own user agent instead of Python's default Python-urllib/3.x
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:  # if a proxy was supplied, register a handler so the opener routes through it
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except urllib.error.URLError as e:  # something went wrong during the download
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            # 4XX errors mean the request itself was bad, while 5XX errors mean the server
            # had a problem, so only retry the download when a 5XX error occurs
            if hasattr(e, 'code') and 500 <= e.code < 600:
                html = download(url, user_agent, proxy, num_retries - 1)
    return html

# Regular-expression scraping: for each field, capture the text inside the
# <td class="w2p_fw"> cell that belongs to the places_<field>__row table row
def scrape(html):
    results = {}
    for field in FIELDS:  # FIELDS is the tuple of country attributes defined further below
        results[field] = re.search(
            '<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field,
            html).groups()[0]
    return results
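A minimal sketch of running this regex scraper against a single country page; the URL is illustrative, and it assumes download() above and the FIELDS tuple defined further below are already in scope:
html = download('http://example.webscraping.com/places/default/view/Afghanistan-1')
if html is not None:
    print(scrape(html.decode('utf-8')))  # prints a dict mapping each field name to its scraped text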
# Throttle adds a delay between downloads to the same domain so we do not hit the site too fast
class Throttle:
    def __init__(self, delay):
        self.delay = delay   # minimum number of seconds between downloads of one domain
        self.domains = {}    # timestamp of the last access to each domain

    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            # compare the required delay with the time elapsed since the last access
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                # the domain has been accessed recently, so we need to sleep
                time.sleep(sleep_secs)
        # update the last accessed time
        self.domains[domain] = datetime.datetime.now()
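A quick illustration of how the throttle is meant to be used (the URLs are placeholders; the second wait() call sleeps for the remaining part of the 5-second delay because both URLs are on the same domain):
throttle = Throttle(delay=5)
for url in ['http://example.webscraping.com/places/default/view/1',
            'http://example.webscraping.com/places/default/view/2']:
    throttle.wait(url)   # sleeps if this domain was accessed less than 5 seconds ago
    html = download(url)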
"""先下载 seed_url 网页的源代码,然后提取出里面所有的链接URL,接着对所有匹配到的链接URL与link_regex 进行匹配,
如果链接URL里面有link_regex内容,就将这个链接URL放入到队列中,下一次 执行 while crawl_queue: 就对这个链接URL进行同样的操作。
反反复复,直到 crawl_queue 队列为空,才退出函数。"""
def link_crawler(seed_url, link_regex, max_depth=2, scrape_callback=None):
    """Crawl from the given seed URL, following links matched by link_regex."""
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before, together with their crawl depth
    seen = {seed_url: 0}  # the seed URL starts at depth 0
    # parse the site's robots.txt before crawling, so we skip pages the site disallows
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(seed_url + '/robots.txt')
    rp.read()
    user_agent = 'brain'
    # rate-limit downloads to the same domain; created once so it remembers access times
    throttle = Throttle(delay=5)  # the example site's robots.txt requests a 5-second delay
    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):  # robots.txt allows this URL, so keep going
            throttle.wait(url)             # call before every download
            html = download(url)
            if html is None:
                continue
            html = html.decode('utf-8')    # download() returns bytes, so decode to str
            links = []
            if scrape_callback:
                # a callback may return extra links that should also be crawled
                links.extend(scrape_callback(url, html) or [])
            depth = seen[url]  # crawl depth of the current page, used to avoid crawler traps
            if depth != max_depth:
                # filter for links matching our regular expression
                for link in get_links(html) + links:
                    if re.match(link_regex, link):
                        link = urllib.parse.urljoin(seed_url, link)
                        if link not in seen:
                            seen[link] = depth + 1  # one level deeper than the current page
                            crawl_queue.append(link)
        else:
            print('Blocked by robots.txt:', url)
def get_links(html):
    """Return a list of links from html."""
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all href values found in the webpage
    return webpage_regex.findall(html)
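For example, on a small hand-written HTML fragment (illustrative only), get_links() returns the href value of every anchor tag, regardless of case:
sample = '<a href="/places/default/view/Algeria-4">Algeria</a> <A HREF="/places/default/index/1">Next</A>'
print(get_links(sample))  # ['/places/default/view/Algeria-4', '/places/default/index/1']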
FIELDS = ('area', 'population', 'iso', 'country', 'capital',
          'continent', 'tld', 'currency_code', 'currency_name', 'phone',
          'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
def scrape_callback(url, html):
    if re.search('/view/', url):
        tree = lxml.html.fromstring(html)
        row = [tree.cssselect('tr#places_%s__row > td.w2p_fw' % field)[0].text_content()
               for field in FIELDS]
        print(url, row)
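This function-style callback can be handed to the crawler in the same way as the class-based callback shown next; the call below is illustrative and mirrors the final line of this article:
link_crawler('http://example.webscraping.com', '/places/default/(index|view)',
             max_depth=2, scrape_callback=scrape_callback)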
import csv

class ScrapeCallback:
    def __init__(self):
        # newline='' keeps the csv module from writing a blank line after every row
        self.writer = csv.writer(open('countries.csv', 'w', newline=''))
        self.fields = FIELDS
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect(
                    'table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            self.writer.writerow(row)
# only follow URLs such as http://example.webscraping.com/places/default/index... or
# http://example.webscraping.com/places/default/view...
link_crawler('http://example.webscraping.com', '/places/default/(index|view)',
             max_depth=2, scrape_callback=ScrapeCallback())
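Once the crawl finishes, the scraped rows are in countries.csv next to the script. A minimal sketch for inspecting the first few rows:
import csv
with open('countries.csv', newline='') as f:
    for i, row in enumerate(csv.reader(f)):
        print(row)    # the first row is the FIELDS header, the rest are country records
        if i >= 2:
            break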