关键词采集词(5.Mysql避免数据重复插入的方法有哪些？-八维教育)

优采云发布时间: 2021-11-19 18:11

　　本文知识点：

　　1、修改了类的编写。

　　2、回顾了 scrapy 的用法：先获取大范围的节点列表，然后用 for 循环逐个取出每一项。取大范围的值时，在 python 中直接打印是没有问题的。比如本文中的 node_list，就可以用 for 循环逐项取出。

　　3、学习了写入Mysql时自动生成时间的方法。

　　4、学习了如何在 xpath 中编写“.//”和“./”。

　　5、Mysql 避免重复插入数据的方法。

　　一、常规

　　折腾了一个上午，原来是登陆页面的问题：

　　网页上的整体索引、PC 索引、移动索引列数据在网页的源代码上是不同的。这样一来，有的数据可以通过常规规则获得，有的则无法获得！

　　结果是这样的：

　　二、别人推荐的关于python的视频

　　三、代码

　　过了几天，学了一点规律，复习了scrapy，终于搞定了：

<p>

import requests

import re

import MySQLdb

from lxml import etree

import time

import datetime

#函数要调用必须先定义，而且顺序必须在前

class KeywordsTool():
    """Collect keyword index data for one host from rank.chinaz.com into MySQL.

    An instance is bound to a single host (``url``) and keeps an open MySQLdb
    connection and cursor that :meth:`insert_to_db` writes through.
    """

    def __init__(self, url):
        """Store the target host and open the database connection.

        Args:
            url: Value substituted into the ``host=`` query parameter of the
                rank.chinaz.com URLs built by :meth:`get_keywords`.
        """
        # Keep externally supplied parameters as instance attributes.
        self.url = url
        # Connect to the database.
        self.conn = MySQLdb.connect(host="localhost", user="root", passwd='', db="testpy", port=3306, charset="utf8")
        # Cursor used for every insert.
        self.cursor = self.conn.cursor()

    def get_url_content(self, url, max_try_number=5):
        """Fetch ``url`` with ``requests.get``, retrying on request errors.

        Args:
            url: Address to fetch.
            max_try_number: Maximum number of attempts. At least one attempt
                is always made, even for values below 1.

        Returns:
            The ``requests`` response, or ``None`` when every attempt failed.
        """
        for _ in range(max(1, max_try_number)):
            try:
                return requests.get(url, timeout=5)
            # Only network/HTTP-level failures are worth retrying; anything
            # else is a programming error and should surface immediately.
            except requests.RequestException as http_err:
                print(url, "抓取报错", http_err)
        print("尝试失败次数过多，放弃尝试")
        return None

    def get_total_page(self, content):
        """Return the total number of keyword pages reported by the site.

        Args:
            content: Response object whose ``.text`` holds the page HTML.

        Returns:
            The page count as an ``int``.

        Raises:
            ValueError: If ``content`` is ``None`` or the page-count span is
                missing or contains no digits.
        """
        if content is None:
            raise ValueError("cannot read the page count: no response was fetched")
        selector = etree.HTML(content.text)
        page_texts = selector.xpath("//span[@class='col-gray02']/text()")
        # Keep only the digits of the first matching text node.
        digits = re.sub(r'\D', '', page_texts[0]) if page_texts else ''
        if not digits:
            raise ValueError("cannot read the page count: page-count span not found")
        return int(digits)

    def insert_to_db(self, word, total_index, pc_index, m_index):
        """Insert one keyword row into the ``keywords`` table and commit.

        ``insert ignore`` makes MySQL skip rows that would violate a unique
        key, so re-crawling does not create duplicates.

        Args:
            word: Keyword text.
            total_index: Overall search index value.
            pc_index: PC search index value.
            m_index: Mobile search index value.
        """
        dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Values are passed as query parameters so the driver escapes them;
        # scraped text containing quotes can no longer break or inject SQL.
        # The doubled %% keeps literal % signs in the date format once the
        # driver substitutes the parameters.
        query = (
            "insert ignore into keywords(word,total_index,pc_index,m_index,create_time) "
            "VALUES (%s,%s,%s,%s,str_to_date(%s,'%%Y-%%m-%%d %%H:%%i:%%s'))"
        )
        insert_result = self.cursor.execute(query, (word, total_index, pc_index, m_index, dt))
        print("insert {}".format(insert_result))
        print('[关键词：', word, '-----------已成功添加到数据库---------------')
        self.conn.commit()

    @staticmethod
    def _first_text(node, path):
        """Return the first result of ``node.xpath(path)``, or ``None`` if empty."""
        matches = node.xpath(path)
        return matches[0] if matches else None

    def get_keywords(self, i):
        """Scrape result page ``i`` for the host and store every keyword row.

        Rows that lack a keyword or any of the three index values are
        reported and skipped instead of aborting the whole page.

        Args:
            i: 1-based page number placed in the ``page=`` query parameter.
        """
        url = "http://rank.chinaz.com/?host={}&st=0&c=&sortType=0&page={}".format(self.url, i)
        # Use the retrying helper so this request gets the same timeout and
        # retry behaviour as the rest of the class.
        response = self.get_url_content(url)
        if response is None:
            print("第{}页抓取失败，已跳过".format(i))
            return
        selector = etree.HTML(response.text)
        node_list = selector.xpath("//li[@class='ReListCent ReLists clearfix']")
        for node in node_list:
            keywords = self._first_text(node, "./div[@class='w25-0 tl pl10 pr pbimg showt']/a/text()")
            index_list = node.xpath("./div[@class='w8-0']/a/text()")
            if len(index_list) >= 3:
                index_total, index_pc, index_m = index_list[:3]
            else:
                # Some rows carry the three values in titled spans instead of
                # the plain column cells.
                index_total = self._first_text(node, ".//span[@title='整体搜索量']/a/text()")
                index_pc = self._first_text(node, ".//span[@title='PC搜索量']/a/text()")
                index_m = self._first_text(node, ".//span[@title='移动搜索量']/a/text()")
            if None in (keywords, index_total, index_pc, index_m):
                print("第{}页存在数据不完整的行，已跳过".format(i))
                continue
            print('采集关键词：', keywords, ',整体指数：', index_total, ',PC指数：', index_pc, ',移动指数：', index_m)
            self.insert_to_db(keywords, index_total, index_pc, index_m)
        print("已经完成第{}页数据的抓取".format(i))
        print("*" * 80)
        # Pause between pages.
        time.sleep(2)

# ===抓取多页数据

def get_all_content(self):

url = "http://rank.chinaz.com/?host={}&st=0&c=&sortType=0&page=1".format(self.url)

r = self.get_url_content(url)

total_page_num = self.get_total_page(r) #这里调用函数self一定不能少

print("关键词总页数为{}页".format(total_page_num))

if total_page_num = max_try_number:

print("尝试失败次数过多，放弃尝试")

return None

def get_total_page(self, content):
    """Return the total number of keyword pages stated in the page text.

    Args:
        content: Page text containing a ``共N页`` marker.

    Returns:
        ``N`` from the first ``共N页`` occurrence, as an ``int``.

    Raises:
        ValueError: If ``content`` is ``None`` or holds no ``共N页`` marker.
    """
    if content is None:
        raise ValueError("cannot read the page count: no content was fetched")
    match = re.search(r'共(\d+)页', content)
    if match is None:
        raise ValueError("cannot read the page count: marker not found in content")
    return int(match.group(1))

def insert_to_db(self, word):
    """Append ``word`` as one line to the text file named after the keyword.

    The file is ``self.keyword + ".txt"``, created on first use.

    Args:
        word: Keyword text to store.
    """
    filename = self.keyword + ".txt"
    # Explicit UTF-8: the keywords are Chinese text and the platform default
    # encoding differs between systems.
    with open(filename, 'a', encoding="utf-8") as f:
        f.write(word + '\n')

def get_keywords(self, i):
    """Scrape page ``i`` of related keywords and store each one.

    Every keyword found on the page is passed to ``self.insert_to_db`` and
    printed.

    Args:
        i: Page number placed at the end of the ``allindex`` URL.
    """
    url = "https://data.chinaz.com/keyword/allindex/{}/{}".format(self.keyword, i)
    # A timeout keeps a stalled connection from hanging the crawl forever.
    content = requests.get(url, timeout=5).text
    selector = etree.HTML(content)
    node_list = selector.xpath("//li[@class='col-224 nofoldtxt']/a/text()")
    for keyword in node_list:
        self.insert_to_db(keyword)
        print(keyword)
    print("已经完成第{}页数据的抓取".format(i))
    print("*" * 80)
    # Pause between pages.
    time.sleep(2)

# ===抓取多页数据

def get_all_content(self):

# url = "https://data.chinaz.com/keyword/analysis/{}".format(self.keyword)

url = "https://data.chinaz.com/keyword/allindex/{}/1".format(self.keyword)

r = self.get_url_content(url)

total_page_num = self.get_total_page(r) #这里调用函数self一定不能少

print("关键词总页数为{}页".format(total_page_num))

if total_page_num

0

2021-11-19

关键词采集词

0 个评论

要回复文章请先登录或注册

AI时代内容工厂

关键词采集词(5.Mysql避免数据重复插入的方法有哪些？-八维教育)

0 个评论

发起人

AI时代内容工厂

关键词采集词(5.Mysql避免数据重复插入的方法有哪些？-八维教育)

0 个评论

发起人

相关问题