c httpclient web scraping (crawler environment: Python 3.7 + PyCharm; the target site has no anti-crawling mechanism at all)

优采云 Published: 2022-02-23 01:09


Crawler environment

Python 3.7 + PyCharm

I recently came across a website, Sooshong (首商网), which holds information on more than a million companies yet has no anti-crawling mechanism at all. For those of us who enjoy writing crawlers, that is hard to resist. I put together a set of code myself, ran three instances concurrently with 20 threads each, crawled for five or six hours, and ended up with 200,000 records. Fantastic!
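The post never shows how those three concurrent runs were started. A minimal sketch, assuming each run simply takes a third of the roughly 1,650 listing pages, could launch them with multiprocessing as below; crawl_pages is a hypothetical stand-in for the per-batch crawling logic in the full script further down:

from multiprocessing import Process

def crawl_pages(start, stop):
    # Hypothetical placeholder: run the thread-pool crawl for listing pages [start, stop)
    for k in range(start, stop, 50):
        print('would crawl listing pages', k, 'to', k + 49)

if __name__ == '__main__':
    # Split the 1..1650 page range into three chunks and crawl them in parallel processes
    chunks = [(1, 551), (551, 1101), (1101, 1651)]
    procs = [Process(target=crawl_pages, args=chunk) for chunk in chunks]
    for p in procs:
        p.start()
    for p in procs:
        p.join()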

  

As usual, the code is right below; all comments and explanations are in the code itself, and it can be run directly:

# -*- coding: utf-8 -*-
# This is the original asynchronous crawler project, not wrapped into functions
import asyncio
import aiohttp
import time
from bs4 import BeautifulSoup
import csv
import requests
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

# Step 1: use a thread pool to collect the company sub-page links from each listing page

########################################################################################################################

# Proxy credentials in user:password@host:port form (defined here but not actually passed to the requests calls below)
pro = 'zhaoshuang:LINA5201314@14.215.44.251:28803'
proxies = {'http': 'http://' + pro,
           'https': 'https://' + pro}

# 加入请求头

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit'

'/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}

wzs = []

def parser(url):
    print(url)
    try:
        response = requests.get(url, headers=headers)
        soup1 = BeautifulSoup(response.text, "lxml")
        # Full copied selector: body > div.list_contain > div.left > div.list_li > ul > li:nth-child(1) > table > tbody > tr > td:nth-child(3) > div.title > a
        wz = soup1.select('div.title')
        for i in wz:
            # The first child of each div.title is the <a> tag whose href is the company sub-page
            wzs.append(i.contents[0].get("href"))
        time.sleep(1)
    except:
        # Request or parse failed; on this site that usually means the company page is still under review
        print('公司正在审核中')

# Crawl the listing pages in batches of 50 (pages 1 to 1650)
for k in range(1, 1651, 50):
    urls = ['http://www.sooshong.com/c-3p{}'.format(num) for num in range(k, k + 50)]
    # Speed up the crawl with a thread pool; each batch holds 50 listing pages, so up to 50 threads could be used
    # Create the thread pool; the right thread count varies by site, and too many threads can overwhelm the site and cause data loss
    executor = ThreadPoolExecutor(max_workers=10)
    # submit() takes the target function first, followed by its arguments (there may be several)
    future_tasks = [executor.submit(parser, url) for url in urls]
    # Wait for every thread in this batch to finish before moving on
    wait(future_tasks, return_when=ALL_COMPLETED)

print('子页链接抓取完毕!')

########################################################################################################################

# Step 2: use a thread pool to collect each company's detail-page link
# Define a function that extracts the detail-page link from each sub-page
wzs1 = []

def parser(url):
    # Parse the sub-page with BeautifulSoup and pull out the detail-page link
    try:
        res = requests.get(url, headers=headers)
        # Parse the response body
        soup = BeautifulSoup(res.text, "lxml")
        # Find the detail-page link on the sub-page and queue it for the next stage
        # Selectors are copied via: click the element, Inspect, Copy selector
        lianjie = soup.select('#main > div.main > div.intro > div.intros > div.text > p > a')
        lianjie = lianjie[0].get('href')
        wzs1.append(lianjie)
        print(lianjie)
    except:
        print('子页解析失败')  # failed to parse the sub-page

# Speed up with the thread pool again; thread count considerations are the same as above
executor = ThreadPoolExecutor(max_workers=10)
# submit() takes the target function first, followed by its arguments
future_tasks = [executor.submit(parser, url) for url in wzs]
# Wait for every thread to finish before continuing
wait(future_tasks, return_when=ALL_COMPLETED)

print('详细页链接获取完毕!')

"""

# 使用异步法抓取子页面的链接

########################################################################################################################

async def get_html(sess, ur):

try:

proxy_auth = aiohttp.BasicAuth('zhaoshuang', 'LINA5201314')

html = await sess.get(ur,

headers=headers) # , proxy='http://'+'14.116.200.33:28803', proxy_auth=proxy_auth)

r = await html.text()

return r

except:

print("error")

# f = requests.get('http://211775.sooshong.com', headers=headers)

wzs1 = []

# 解析网页

async def parser(respo):

# 利用正则表达式解析网页

try:

# 对响应体进行解析

soup = BeautifulSoup(respo, "lxml")

# 找到页面子链接,进入子页面,对子页面进行抓取

# 用select函数抽取需要的内容,单击需要的内容》检查》copy select

lianjie = soup.select('#main > div.main > div.intro > div.intros > div.text > p > a')

lianjie = lianjie[0].get('href')

wzs1.append(lianjie)

print(lianjie)

company = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(1) > strong") # 标题

company = company[0].text

# 匹配电话号码

dianhua = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(3)") # 地址

dianhua = dianhua[0].text.split(":")[1]

# 匹配手机号码

phone = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(4)") # 日租价格

phone = phone[0].text.split(":")[1]

# 匹配传真

chuanzhen = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(5)") # 月租价格

chuanzhen = chuanzhen[0].text.split(":")[1]

# 经营模式

jingying = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(8)") # 面积大小

jingying = jingying[0].text.split(":")[1]

# 公司地址

address = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(9)') # 抽取建造年份

address = address[0].text.split(":")[1]

# 公司简介

# introduction = soup.select("#main > div.main > div.intro > div.intros > div.text > p") # 楼层属性

# introduction = introduction[0].text.strip()

data = [company, address, dianhua, phone, chuanzhen, jingying]

print(data)

with open('首富网企业7.csv', 'a+', newline='', encoding='GB2312', errors='ignore') as csvfile:

w1 = csv.writer(csvfile)

w1.writerow(data, [1])

except:

print("出错!")

async def main(loop):

async with aiohttp.ClientSession() as sess:

tasks = []

for ii in wzs:

ur = ii

try:

tasks.append(loop.create_task(get_html(sess, ur)))

except:

print('error')

# 设置0.1的网络延迟增加爬取效率

await asyncio.sleep(0.1)

finished, unfinised = await asyncio.wait(tasks)

for i1 in finished:

await parser(i1.result())

if __name__ == '__main__':

t1 = time.time()

loop = asyncio.get_event_loop()

loop.run_until_complete(main(loop))

print("花费时间", time.time() - t1)

print('详细页链接抓取完毕!')

"""

########################################################################################################################

# Step 3: use a thread pool to scrape the required fields from each detail page
########################################################################################################################
# Define a function that extracts the required fields from each detail page

def parser(url):
    global data
    try:
        res = requests.get(url, headers=headers)
        # Parse the response body
        soup = BeautifulSoup(res.text, 'lxml')
        # Pull each required field out of the detail page
        # Selectors are copied via: click the element, Inspect, Copy selector
        company = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(1) > strong')
        company = company[0].text
        name = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(2)')
        name = name[0].text
        dianhua = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(3)')
        dianhua = dianhua[0].text.split(':')[1]
        shouji = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(4)')
        shouji = shouji[0].text.split(':')[1]
        chuanzhen = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(5)')
        chuanzhen = chuanzhen[0].text.split(':')[1]
        product = soup.select('tr:nth-child(1) > td:nth-child(2)')
        product = product[0].text
        company_type = soup.select('tr:nth-child(2) > td:nth-child(2) > span')
        company_type = company_type[0].text.strip()
        legal_person = soup.select('tr:nth-child(3) > td:nth-child(2)')
        legal_person = legal_person[0].text
        main_address = soup.select('tr:nth-child(5) > td:nth-child(2) > span')
        main_address = main_address[0].text
        brand = soup.select('tr:nth-child(6) > td:nth-child(2) > span')
        brand = brand[0].text
        area = soup.select('tr:nth-child(9) > td:nth-child(2) > span')
        area = area[0].text
        industry = soup.select('tr:nth-child(1) > td:nth-child(4)')
        industry = industry[0].text.strip()
        address = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(9)')
        address = address[0].text.split(':')[1]
        jingying = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(8)')
        jingying = jingying[0].text.split(':')[1]
        date = soup.select('tr:nth-child(5) > td:nth-child(4) > span')
        date = date[0].text
        wangzhi = soup.select('tr:nth-child(12) > td:nth-child(4) > p > span > a')
        wangzhi = wangzhi[0].text
        # Put all the fields into one row and print it to the console
        data = [company, date, name, legal_person, shouji, dianhua, chuanzhen, company_type, jingying, industry,
                product, wangzhi, area, brand, address, main_address]
        print(data)
        with open('服装1.csv', 'a', newline='', encoding='GB2312') as csvfile:
            w1 = csv.writer(csvfile)
            w1.writerow(data)
    except:
        # Fall back to UTF-8 when the row cannot be written as GB2312
        with open('服装2.csv', 'a', newline='', encoding='utf-8-sig') as csvfile:
            w1 = csv.writer(csvfile)
            w1.writerow(data)
        print('utf解码成功')  # wrote the row as UTF-8 instead

# Speed up with the thread pool one last time; thread count considerations are the same as above
executor = ThreadPoolExecutor(max_workers=10)
# submit() takes the target function first, followed by its arguments
future_tasks = [executor.submit(parser, url) for url in wzs1]
# Wait for every thread to finish before printing the final message
wait(future_tasks, return_when=ALL_COMPLETED)

print('全部信息抓取完毕')
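One caveat about the script above: parser() appends rows to the CSV files from many worker threads at once, and concurrent writes to the same file can interleave or corrupt rows. A minimal sketch of one way to serialise those writes with a shared lock follows; save_row, worker and output.csv are illustrative names, not part of the original script:

import csv
import threading
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

csv_lock = threading.Lock()

def save_row(path, row):
    # Only one thread at a time may append to the file, so rows never interleave
    with csv_lock:
        with open(path, 'a', newline='', encoding='utf-8-sig') as f:
            csv.writer(f).writerow(row)

def worker(url):
    # Hypothetical stand-in for the real parser(): fetch, parse, then save one row
    row = [url, 'company', 'phone', 'fax']
    save_row('output.csv', row)

if __name__ == '__main__':
    urls = ['http://www.sooshong.com/c-3p{}'.format(n) for n in range(1, 6)]
    executor = ThreadPoolExecutor(max_workers=5)
    tasks = [executor.submit(worker, u) for u in urls]
    wait(tasks, return_when=ALL_COMPLETED)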
