Scraping Data from Web Pages with a Crawler

优采云   Published: 2021-10-14 13:01


  We need this crawler to extract some data from each web page and then do something with it. This technique is also known as scraping.

  2.1 Analyzing the page

  Right-click on the page and choose the View Page Source option to see the web page's HTML source.
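  If you prefer to inspect the source programmatically, here is a minimal sketch (using the example site that appears later in this article; the 500-character cutoff is just for a quick look) that downloads the page and prints the start of its HTML:

import urllib.request

# Download the page and print the beginning of its HTML source, as an
# alternative to the browser's View Page Source option.
url = 'http://example.webscraping.com'
request = urllib.request.Request(url, headers={'User-agent': 'brain'})
html = urllib.request.urlopen(request).read().decode('utf-8')
print(html[:500])  # only the first 500 characters, for a quick look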

  2.2 Three approaches to scraping a web page

  2.2.1 Regular expressions

  When using a regular expression to extract the area data, we first need to try to match the contents of the <td> element whose class is w2p_fw, which can be found in the page source.

  The implementation code is as follows:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import time
import datetime
import urllib.request
import urllib.parse
import urllib.robotparser
import lxml.html

# Download the page at url. proxy enables optional proxy support; it defaults to
# None, so simply pass a proxy address if you want to use one.
def download(url, user_agent='brain', proxy=None, num_retries=2):
    print('Downloading:', url)
    # set our own user agent rather than Python's default Python-urllib/3.6
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:  # if a proxy was given, add a handler so downloads go through it
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except urllib.request.URLError as e:  # something went wrong during the download
        print('Download error:', e.reason)
        html = None

        if num_retries > 0:
            # 4XX errors occur when there is a problem with the request, while 5XX
            # errors occur when there is a problem on the server side, so only
            # retry the download on 5XX errors
            if hasattr(e, 'code') and 500 <= e.code < 600:
                html = download(url, user_agent, proxy, num_retries - 1)
    return html

# Regular-expression scraping: for each field, match the contents of the
# <td class="w2p_fw"> cell inside that field's table row.
# FIELDS is the tuple of field names defined further below.
def re_scraper(html):
    results = {}
    for field in FIELDS:
        results[field] = re.search('<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>'
                                   % field, html).groups()[0]
    return results

# Throttle: add a delay between downloads to the same domain so the crawler
# does not hit the server too quickly
class Throttle:
    def __init__(self, delay):
        self.delay = delay  # minimum delay between downloads for each domain
        self.domains = {}   # timestamp of when each domain was last accessed

    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:

            # compare the required delay with the time elapsed since this
            # domain was last accessed
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                # the domain has been accessed recently,
                # so we need to sleep
                time.sleep(sleep_secs)
        # update the last accessed time
        self.domains[domain] = datetime.datetime.now()
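  As a quick illustration of how Throttle is meant to be used (a sketch only; both URLs are on the example domain used in this article), each call to wait() sleeps when the same domain was visited less than delay seconds ago, and download() is the function defined above:

throttle = Throttle(delay=5)
for url in ['http://example.webscraping.com',
            'http://example.webscraping.com/places/default/index']:
    throttle.wait(url)    # sleeps if this domain was accessed less than 5 seconds ago
    html = download(url)  # download() as defined earlier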

"""先下载 seed_url 网页的源代码,然后提取出里面所有的链接URL,接着对所有匹配到的链接URL与link_regex 进行匹配,

如果链接URL里面有link_regex内容,就将这个链接URL放入到队列中,下一次 执行 while crawl_queue: 就对这个链接URL进行同样的操作。

反反复复,直到 crawl_queue 队列为空,才退出函数。"""

def link_crawler(seed_url, link_regex, max_depth=2, scrape_callback=None):
    """Crawl from the given seed URL following links matched by link_regex"""
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    # seen = set(crawl_queue)
    seen = {seed_url: 0}  # the crawl depth of seed_url is initialised to 0
    # parse the site's robots.txt before crawling to check whether crawling is
    # allowed, so we avoid pages the site forbids or restricts
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(seed_url + '/robots.txt')
    rp.read()
    user_agent = 'brain'
    # rate limiter used before every download; the delay in the example site's
    # robots.txt is 5 seconds
    throttle = Throttle(delay=5)
    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):  # continue only if robots.txt allows crawling this URL
            throttle.wait(url)
            html = download(url)
            if html is None:
                continue
            html = html.decode('utf-8')  # the downloaded bytes need to be decoded to a UTF-8 string
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            # filter for links matching our regular expression
            depth = seen[url]  # crawl depth of this URL, used to avoid crawler traps
            if depth != max_depth:
                for link in get_links(html):
                    if re.match(link_regex, link):
                        link = urllib.parse.urljoin(seed_url, link)
                        if link not in seen:
                            # seen.add(link)
                            seen[link] = depth + 1  # one level deeper than the current URL
                            crawl_queue.append(link)
        else:
            print('Blocked by %s robots.txt' % url)

def get_links(html):
    """Return a list of links from html"""
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)
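  To see what get_links returns, here is a tiny self-contained check on a made-up HTML snippet (the two hrefs are purely illustrative):

sample_html = '<a href="/places/default/index">Index</a> <a href="/places/default/view/1">View</a>'
print(get_links(sample_html))
# prints: ['/places/default/index', '/places/default/view/1']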

FIELDS = ('area', 'population', 'iso', 'country', 'capital',
          'continent', 'tld', 'currency_code', 'currency_name', 'phone',
          'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')

def scrape_callback(url, html):
    if re.search('/view/', url):
        tree = lxml.html.fromstring(html)
        row = [tree.cssselect('tr#places_%s__row > td.w2p_fw' % field)[0].text_content()
               for field in FIELDS]
        print(url, row)

import csv

class ScrapeCallback:
    def __init__(self):
        # passing newline='' prevents blank lines from appearing in the generated file
        self.writer = csv.writer(open('countries.csv', 'w', newline=''))
        self.fields = FIELDS
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            self.writer.writerow(row)

# we only want URLs like http://example.webscraping.com/places/default/index... or
# http://example.webscraping.com/places/default/view...
link_crawler('http://example.webscraping.com', '/places/default' + '/(index|view)',
             max_depth=2, scrape_callback=ScrapeCallback())
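  Once the crawl finishes, the rows collected by ScrapeCallback end up in countries.csv; a small sketch to read the file back and spot-check the output:

import csv

with open('countries.csv', newline='') as f:
    reader = csv.reader(f)
    for i, row in enumerate(reader):
        print(row)   # the header row first, then one row per scraped country page
        if i >= 3:   # a few rows are enough for a sanity check
            break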
