Scraping Data from Baidu Baike: A Simple Crawler in Python (HTML Parser)
Using Python, we will write a simple crawler that fetches 1000 Baidu Baike pages related to the entry "Python" (the number of pages can be set to whatever you like).
This small crawler project can be broken into five modules.
First we need a program entry point, spider_main, which also acts as the crawler's overall controller (the scheduler).
The program is initialized as follows:
self.urls = url_manager.UrlManger()
self.downloader = html_downloader.HtmlDownloader()
self.parser = html_parser.HtmlParser()
self.outputer = html_outputer.HtmlOutputer()
So we need a URL manager, an HTML downloader, an HTML parser, and an outputer for the parsed results. Take, for example, the statement
self.urls = url_manager.UrlManger()
Here, url_manager is the module name and UrlManger is a class defined in that module; the other three lines follow the same pattern.
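The import statement in the spider_main code below suggests that the five modules live in a package named baike_spider. A plausible on-disk layout would be the following (the file names are inferred from the module names, and __init__.py is what makes the directory a Python 2 package):

baike_spider/
    __init__.py
    spider_main.py        # entry point / scheduler
    url_manager.py        # UrlManger: pending vs. already-crawled URLs
    html_downloader.py    # HtmlDownloader: fetches page HTML
    html_parser.py        # HtmlParser: extracts links and data
    html_outputer.py      # HtmlOutputer: writes output.html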
Beautiful Soup provides simple, Python-style functions for navigating, searching, and modifying the parse tree. It is a toolkit that parses a document and hands you the data you need to extract, and because it is so simple, a complete application takes very little code. Beautiful Soup automatically converts input documents to Unicode and output documents to UTF-8, so you do not need to think about encodings, unless the document does not declare one; in that case Beautiful Soup cannot detect the encoding automatically and you simply specify the original encoding yourself. Working on top of popular Python parsers such as lxml and html5lib, Beautiful Soup lets you try different parsing strategies or trade flexibility for speed.
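To give a feel for this API, here is a minimal, self-contained sketch in the same Python 2 style as the rest of the article (the HTML fragment is made up for illustration and is not a real Baidu Baike page):

from bs4 import BeautifulSoup

# a tiny, made-up document, only to show navigation and searching
html_doc = """
<html><body>
<h1>Python</h1>
<a href="/view/21087.htm">Python</a>
<a href="/view/10812319.htm">Guido van Rossum</a>
</body></html>
"""

soup = BeautifulSoup(html_doc, 'html.parser')   # build the parse tree
print soup.find('h1').get_text()                # -> Python
for link in soup.find_all('a'):                 # every <a> tag in the document
    print link['href'], link.get_text()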
For demonstration purposes, 1000 pages were crawled and given a preliminary analysis. Students who are interested can run the crawl and analysis on any pages they like; the program is general-purpose, and with simple modifications it can scrape the content of any web page.
The code for the five modules is shown below.
spider_main.py, the scheduler:

#coding:utf-8
from baike_spider import html_downloader, url_manager, html_parser, \
    html_outputer


class SpiderMain(object):
    def __init__(self):
        # the four collaborators: URL manager, downloader, parser, outputer
        self.urls = url_manager.UrlManger()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d : %s' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)      # queue newly discovered links
                self.outputer.collect_data(new_data)

                if count == 1000:                     # stop after 1000 pages; adjust as needed
                    break

                count = count + 1
            except:
                print 'craw failed'

        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
url_manager.py, the URL manager:

#coding:utf-8
class UrlManger(object):

    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs that have already been crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
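As a quick sanity check of the de-duplication behaviour, a throwaway snippet like the following (run inside url_manager.py, or after importing UrlManger) is not part of the project but shows what the manager does:

manager = UrlManger()
manager.add_new_url("http://baike.baidu.com/view/21087.htm")
manager.add_new_url("http://baike.baidu.com/view/21087.htm")   # duplicate, silently ignored
print manager.has_new_url()      # True  (one URL waiting)
print manager.get_new_url()      # http://baike.baidu.com/view/21087.htm
print manager.has_new_url()      # False (it has moved to old_urls)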
html_downloader.py, the HTML downloader:

#coding:utf-8
import urllib2


class HtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None

        response = urllib2.urlopen(url)

        # only accept successful responses
        if response.getcode() != 200:
            return None

        return response.read()
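The downloader above is Python 2 code; urllib2 no longer exists in Python 3, where its functionality lives in urllib.request. If you are following along on Python 3, a roughly equivalent downloader might look like this (a sketch, not the original code):

import urllib.request   # Python 3 replacement for urllib2


class HtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)   # same call, new module path
        if response.getcode() != 200:
            return None
        return response.read()                   # returns bytes in Python 3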
html_parser.py, the HTML parser:

#coding:utf-8
from bs4 import BeautifulSoup
import re
import urlparse


class HtmlParser(object):

    def _get_new_urls(self, page_url, soup):
        # collect links of the form /view/<id>.htm and turn them into absolute URLs
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data['url'] = page_url
        # title node, e.g. <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()
        # summary node: <div class="lemma-summary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()

        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return None, None

        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
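To make _get_new_urls more concrete, here is the same regular expression and urlparse.urljoin applied to a single hypothetical href value (the link target below is made up for illustration):

import re
import urlparse                      # urllib.parse in Python 3

page_url = "http://baike.baidu.com/view/21087.htm"
href = "/view/10812319.htm"          # hypothetical relative link found on the page

print re.match(r"/view/\d+\.htm", href) is not None   # True: the pattern accepts it
print urlparse.urljoin(page_url, href)                 # http://baike.baidu.com/view/10812319.htm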
html_outputer.py, the result outputer:

#coding:utf-8
class HtmlOutputer(object):

    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        # write the collected records into a simple HTML table
        fout = open('output.html', 'w')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")

        count = 1
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%d</td>" % count)
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
            fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
            fout.write("</tr>")
            count = count + 1

        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
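One optional variation (a sketch with stand-in data, not the author's code): opening the file with codecs.open in UTF-8 lets output_html write Unicode strings directly and drop the per-field .encode('utf-8') calls.

import codecs

datas = [{'url': 'http://baike.baidu.com/view/21087.htm',
          'title': u'Python',
          'summary': u'Stand-in summary text.'}]          # placeholder records for the demo

fout = codecs.open('output.html', 'w', encoding='utf-8')  # the file object handles encoding
fout.write(u"<html><body><table>")
for data in datas:
    fout.write(u"<tr>")
    fout.write(u"<td>%s</td>" % data['url'])
    fout.write(u"<td>%s</td>" % data['title'])            # no manual .encode('utf-8')
    fout.write(u"<td>%s</td>" % data['summary'])
    fout.write(u"</tr>")
fout.write(u"</table></body></html>")
fout.close()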