
Developing a Simple Crawler in Python: Hands-On

1. Analyze the target

The goal of this hands-on article is to crawl the title and summary of the Baidu Baike "Python" entry page, as well as the titles and summaries of the entry pages it links to.

Data format: for each entry page we collect its URL, its title (the h1 inside the dd node with class lemmaWgt-lemmaTitle-title) and its summary (the div with class lemma-summary); a sample record is sketched below.
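A minimal sketch of one such record, with made-up summary text (this example is not part of the original article):

# One crawled record in the shape the parser below produces; values are illustrative
sample_record = {
    'url': 'http://baike.baidu.com/view/21087.htm',
    'title': u'Python',
    'summary': u'Python is an interpreted, high-level programming language ...',  # made-up text
}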

Page encoding: UTF-8

2. URL manager

The code is as follows:

# coding:utf8

class UrlManager(object):
    def __init__(self):
        # Initialize the set of URLs waiting to be crawled and the set of URLs already crawled
        self.new_urls = set()
        self.old_urls = set()

    # Add a single new URL to new_urls
    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    # Pop one URL to crawl and move it into old_urls
    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    # Check whether there are still URLs waiting to be crawled
    def has_new_url(self):
        return len(self.new_urls) != 0

    # Add multiple URLs to new_urls
    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)
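To see the manager's bookkeeping in action, a quick check such as the following (not part of the original article) can be run below the class:

# Hypothetical quick check of UrlManager
manager = UrlManager()
manager.add_new_url('http://baike.baidu.com/view/21087.htm')
manager.add_new_url('http://baike.baidu.com/view/21087.htm')  # duplicate, silently ignored
print manager.has_new_url()   # True: one URL is waiting to be crawled
url = manager.get_new_url()   # pops the URL and records it in old_urls
manager.add_new_url(url)      # already crawled, so it is not queued again
print manager.has_new_url()   # False: nothing left to crawl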

3. Page downloader

The code is as follows:

# coding:utf8

import urllib2

class HtmlDownloader(object):
    # Download the page content of a URL in the simplest way, using urllib2
    def download(self, url):
        if url is None:
            return None
        resp = urllib2.urlopen(url)
        if resp.getcode() != 200:
            return None
        return resp.read()
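The downloader above is Python 2 code (urllib2 does not exist in Python 3). If you want to follow along on Python 3, a minimal equivalent sketch using urllib.request could look like this; it is an adaptation, not part of the original article:

# coding:utf8
# Python 3 sketch of the same downloader (adaptation, not from the original tutorial)
from urllib import request

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        resp = request.urlopen(url)
        if resp.getcode() != 200:
            return None
        # read() returns bytes in Python 3; decode with the page encoding (UTF-8 here)
        return resp.read().decode('utf-8')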

4. Page parser

The code is as follows:

# coding:utf8

import re
import urlparse

from bs4 import BeautifulSoup

class HtmlParser(object):
    # Extract the URLs of related entry pages
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # Entry links look like /view/123.htm
        links = soup.find_all('a', href=re.compile(r'/view/\d+\.htm'))
        for link in links:
            new_url = link['href']
            # Complete /view/123.htm into http://baike.baidu.com/view/123.htm
            new_full_url = urlparse.urljoin(page_url, new_url)
            # Convert the parsed unicode URL to utf-8
            new_urls.add(new_full_url.encode('utf-8'))
        return new_urls

    # Extract the page title and summary
    def _get_new_data(self, page_url, soup):
        res_data = {}
        # url
        res_data['url'] = page_url
        # Title node, e.g. <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        res_data['title'] = title_node.get_text()
        # Summary node, e.g. <div class="lemma-summary">...</div>
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()
        return res_data

    # Parse the downloaded page content
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return None
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
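Note that urlparse is also a Python 2 module; on Python 3 the same join lives in urllib.parse, and the utf-8 encode of the resulting URL becomes unnecessary. A minimal sketch of the equivalent call (adaptation, not from the original tutorial):

# Python 3 sketch of completing a relative entry link
from urllib.parse import urljoin

page_url = 'http://baike.baidu.com/view/21087.htm'
print(urljoin(page_url, '/view/123.htm'))  # -> http://baike.baidu.com/view/123.htm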

5. Data outputter

The code is as follows:

# coding:utf8

class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    # Collect one parsed record
    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    # Write the collected records out as a simple HTML table
    def output_html(self):
        fout = open('output.html', 'w')
        fout.write('<html>')
        fout.write('<head>')
        fout.write('<meta charset="utf-8">')
        fout.write('</head>')
        fout.write('<body>')
        fout.write('<table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'].encode('utf-8'))
            fout.write('<td>%s</td>' % data['title'].encode('utf-8'))
            fout.write('<td>%s</td>' % data['summary'].encode('utf-8'))
            fout.write('</tr>')
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
        fout.close()
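The outputter can be exercised on its own with a hand-made record (this quick check is not part of the original article):

# Hypothetical standalone check of HtmlOutputer
outputer = HtmlOutputer()
outputer.collect_data({
    'url': u'http://baike.baidu.com/view/21087.htm',
    'title': u'Python',
    'summary': u'Made-up summary text for testing.',
})
outputer.output_html()  # writes output.html in the current directory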

6. Spider scheduler

# coding:utf8

from baike1 import url_manager, html_downloader, html_parser, html_outputer

class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, url):
        count = 1
        self.urls.add_new_url(url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d: %s' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                # Stop after 100 pages
                if count == 100:
                    break
                count += 1
            except Exception:
                print 'craw failed'
        self.outputer.output_html()

if __name__ == "__main__":
    root_url = 'http://baike.baidu.com/view/21087.htm'
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
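The import at the top of the scheduler assumes the four components live as modules inside a package named baike1. The layout below is inferred from that import statement; only the name of the scheduler file itself is an assumption:

baike1/
    __init__.py
    url_manager.py        # UrlManager
    html_downloader.py    # HtmlDownloader
    html_parser.py        # HtmlParser
    html_outputer.py      # HtmlOutputer
spider_main.py            # SpiderMain (the scheduler above, assumed file name)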

[Figure: screenshot of the program's run output]

And with that, a very simple crawler is complete. Hooray!
