Scraping articles with free proxies (free proxies are unstable, so retry and switch proxies at the same time; proxy-scraping download address attached)

优采云 Published: 2021-09-25 18:36


  1. Proxy file format (from the proxy scraping source):
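  Judging from how the script below parses ip.txt (each non-empty line is split() into [IP, port, scheme] and reassembled as scheme://IP:port), the expected format is one whitespace-separated proxy per line, for example (the first entry is the fallback proxy hard-coded in the script; the second is a made-up placeholder):

124.172.232.49 8010 http
61.135.217.7 80 https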


  2. Free proxies are unreliable, so a decorator is used to retry the connection and switch proxies at the same time. Because the decorated function picks a random proxy from the pool on every call, each retry automatically goes out through a different proxy. The full Python 2.7 script is below.

# coding: utf-8
# python 2.7
# 小说棋 single-novel scraper  http://www.xs7.la/
# Replace the first-chapter URL and the total chapter count below.
# ip.txt is the proxy pool.

import urllib2
from bs4 import BeautifulSoup
import sys
import traceback
import random
import gzip  # unused: gzip responses are noted but not decompressed below

reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 hack: default str/unicode conversions to utf-8

f = open("out.txt", "a+")

headers = {
    "Host": "www.xs7.la",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
    "Content-Type": "text/html",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Referer": "http://www.xs7.la/book/18_18966/",
    "Accept-Encoding": "deflate"  # avoid gzip so the body needs no decompression
}

url = "http://www.xs7.la/book/18_18966/7828246.html"  # first-chapter URL
page = 184  # chapter count
nextHref = url
ipPool = []

def IPpool():
    # Load the proxy pool: one whitespace-separated "IP PORT SCHEME" entry per line.
    reader = open('ip.txt')
    line = reader.readline()
    while line:
        if line.strip() != '':
            ipPool.append(line.split())
        line = reader.readline()
    reader.close()

RETRIES = 0
count = {"num": RETRIES}  # number of retries so far

def conn_try_again(function):
    # Decorator: on any exception, call the function again (up to 10 extra
    # times). Since getContent() picks a random proxy on every call, each
    # retry also switches to a different proxy.
    def wrapped(*args, **kwargs):
        try:
            return function(*args, **kwargs)
        except Exception, err:
            print("--retrying, attempt %s (11 in total)--" % (count['num'] + 1))
            if count['num'] < 10:
                count['num'] += 1
                return wrapped(*args, **kwargs)
            else:
                raise Exception(err)
    return wrapped

bsObj = None

def getCoding(strInput):
    '''Detect the encoding of the input: unicode, utf8 or gbk.'''
    if isinstance(strInput, unicode):
        return "unicode"
    try:
        strInput.decode("utf8")
        return 'utf8'
    except:
        pass
    try:
        strInput.decode("gbk")
        return 'gbk'
    except:
        pass

@conn_try_again
def getContent(url):
    global nextHref, page, bsObj
    # Proxy switch: True = random proxy from the pool, False = fixed fallback proxy.
    proxySwitch = True
    try:
        poolLen = len(ipPool)
        if poolLen > 0:
            i = random.randint(0, poolLen - 1)
            print(ipPool[i])
            proxy_host = ipPool[i][2] + "://" + ipPool[i][0] + ":" + ipPool[i][1]
            proxy_temp = {ipPool[i][2]: proxy_host}
            proxy_support = urllib2.ProxyHandler(proxy_temp)
        else:
            print('--proxy pool is empty, using the local address--')
            proxy_support = urllib2.ProxyHandler({})
        nullproxy_handler = urllib2.ProxyHandler({"http": "124.172.232.49:8010"})
        if proxySwitch:
            opener = urllib2.build_opener(proxy_support)
        else:
            opener = urllib2.build_opener(nullproxy_handler)
        urllib2.install_opener(opener)
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req, timeout=3)
        r = response.read()
        encode = getCoding(r)
        if encode is None:
            print(response.info().get('Content-Encoding'))
            # a gzip response would need decompressing here
        else:
            r = r.decode(encode)
        # print(r)
        bsObj = BeautifulSoup(r, 'lxml')
    except Exception, err:
        raise Exception(err)
    # print(bsObj)
    contentDiv = bsObj.find('div', id='content')
    content = contentDiv.get_text()
    preAndNextBar = bsObj.find('div', id='thumb')
    title = bsObj.find('div', id='bgdiv').h1.get_text()
    if "下一章" in preAndNextBar.get_text():  # "下一章" = "next chapter"
        next = None
        aList = preAndNextBar.findAll('a')
        for i in aList:
            if "下一章" in i.get_text():
                next = i
        if next is None:
            print("next-chapter link is empty")
            return True
        nextHref = next.get('href')
        print(title)
        # print(content)
        print(nextHref)
        f.write("#####" + '\n')
        f.write(title + '\n')
        f.write(content + '\n')
        count['num'] = 0  # reset the retry counter after a successful chapter
    else:
        return True

def main():
    IPpool()
    global page
    try:
        for num in range(1, page):
            if getContent(nextHref):
                break
        print("--- end ---")
    except Exception, e:
        traceback.print_exc()
    finally:
        f.close()

main()
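  The script above runs only on Python 2 (urllib2, the `except Exception, err` syntax, reload(sys)). For readers on Python 3, here is a minimal sketch of the same retry-and-switch-proxy decorator idea; it assumes the third-party requests library and the same ip.txt format, and the function names and retry count are illustrative, not part of the original:

# coding: utf-8
# Python 3 sketch (an assumption, not the original script): retry a request
# and switch to a different random proxy on every failure. Assumes ip.txt
# uses the same "IP PORT SCHEME" lines and that requests is installed.
import functools
import random
import requests

def load_pool(path="ip.txt"):
    # One whitespace-separated [ip, port, scheme] entry per non-empty line.
    with open(path) as reader:
        return [line.split() for line in reader if line.strip()]

def retry_with_new_proxy(retries=10):
    # Decorator factory: re-invoke the wrapped function up to `retries`
    # extra times; because fetch() re-picks a proxy on every call, each
    # retry automatically goes out through a different address.
    def decorator(func):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            last_err = None
            for attempt in range(retries + 1):
                try:
                    return func(*args, **kwargs)
                except Exception as err:
                    last_err = err
                    print("--attempt %d of %d failed: %s--" % (attempt + 1, retries + 1, err))
            raise last_err
        return wrapped
    return decorator

pool = load_pool()

@retry_with_new_proxy(retries=10)
def fetch(url):
    proxies = {}
    if pool:
        ip, port, scheme = random.choice(pool)  # new random proxy per call
        proxies = {scheme: "%s://%s:%s" % (scheme, ip, port)}
    resp = requests.get(url, proxies=proxies, timeout=3)
    resp.encoding = resp.apparent_encoding  # handles utf-8 and gbk pages alike
    return resp.text

  Unlike the original, which tracks attempts in a module-level count dict and resets it after each successful chapter, this sketch keeps the attempt counter local to each call, so there is no shared mutable state to reset.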

  Attachment: proxy scraper

  Download address:
