淘宝头条采集爬虫技术分享(支持一键发布)
优采云 发布时间: 2020-08-06 01:06 许多从事电商的朋友都需要及时掌握各大电商平台的最新资讯。淘宝头条上有很多达人分享的文章,能够引爆流量;遗憾的是,这么好的文章往往没有被利用起来。那么我们如何实时采集最新资讯呢?把达人的文章采集下来,再一键发布到自己的网站,是每个站长的梦想。今天,我将分享淘宝头条文章采集的爬虫技术。
淘宝头条的文章很难采集。通用的基于浏览器引擎的采集器受 js 跳转的限制,无法稳定地采集;基于 http/https 模拟请求的采集器则面临更大的挑战:淘宝头条的请求均经过签名加密,必须分析出其加密算法,才能模拟正常请求并采集到数据。下面 g兄弟将分析如何通过模拟请求抓取淘宝头条的文章数据。
准备工作
1. 分析请求并破解加密算法
2. 制作爬虫
3. 测试采集
分析请求
首先分析请求,然后发现标题请求的网址如下
%7B%22columnId%22%3A%2246%22%2C%22publishId%22%3A%2266957211%22%7D
其中有两个关键参数,分别是 data 和 sign:data 是 JSON 格式的请求参数(上面是它经过 URL 编码后的形式),sign 是校验字段,需要分析其加密算法才能生成。
通过g兄弟的努力,对其加密的主要代码进行了如下分析:
/**
 * Rotate a 32-bit word to the left.
 *
 * The published listing read `b >> 32-a`; the `b << a |` half (and the
 * unsigned shift) were lost when the page markup was stripped, so the
 * complete rotate is restored here.
 *
 * @param {number} b - Word to rotate; only its low 32 bits are used.
 * @param {number} a - Number of bit positions to rotate by.
 * @returns {number} The rotated word as a signed 32-bit integer.
 */
function a9(b, a) {
  return (b << a) | (b >>> (32 - a));
}
/**
 * Add two 32-bit words modulo 2^32.
 *
 * The two top bits of each operand are masked off and handled separately so
 * the intermediate sum never exceeds 31 bits. Because every operand goes
 * through a bitwise operator, `undefined` operands count as 0; do not replace
 * this with a plain `+`, which would yield NaN for them.
 *
 * @param {number} g - First addend.
 * @param {number} d - Second addend.
 * @returns {number} The low 32 bits of `g + d` as a signed 32-bit integer.
 */
function bf(g, d) {
  var gBit31 = 2147483648 & g;
  var dBit31 = 2147483648 & d;
  var gBit30 = 1073741824 & g;
  var dBit30 = 1073741824 & d;
  var low = (1073741823 & g) + (1073741823 & d);

  // Both bit-30s set: they carry into bit 31.
  if (gBit30 & dBit30) {
    return 2147483648 ^ low ^ gBit31 ^ dBit31;
  }
  // Exactly one bit-30 set: the outcome depends on the carry out of the low sum.
  if (gBit30 | dBit30) {
    if (1073741824 & low) {
      return 3221225472 ^ low ^ gBit31 ^ dBit31;
    }
    return 1073741824 ^ low ^ gBit31 ^ dBit31;
  }
  return low ^ gBit31 ^ dBit31;
}
/**
 * Bitwise selector: for each bit, take `a` where `b` is 1 and `c` where
 * `b` is 0. This is the round-1 auxiliary function F of RFC 1321.
 *
 * @param {number} b - Selector word.
 * @param {number} a - Word chosen where `b` has a 1 bit.
 * @param {number} c - Word chosen where `b` has a 0 bit.
 * @returns {number} Signed 32-bit result of `(b & a) | (~b & c)`.
 */
function bk(b, a, c) {
  return (b & a) | (~b & c);
}
/**
 * Bitwise selector keyed on the third argument: take `b` where `c` is 1 and
 * `a` where `c` is 0. This is the round-2 auxiliary function G of RFC 1321.
 *
 * The published listing was garbled to `b和c | &&c`; the expression is
 * restored from the algorithm definition.
 *
 * @param {number} b - Word chosen where `c` has a 1 bit.
 * @param {number} a - Word chosen where `c` has a 0 bit.
 * @param {number} c - Selector word.
 * @returns {number} Signed 32-bit result of `(b & c) | (a & ~c)`.
 */
function bb(b, a, c) {
  return (b & c) | (a & ~c);
}
/**
 * XOR of three words. This is the round-3 auxiliary function H of RFC 1321.
 *
 * @param {number} b - First word.
 * @param {number} a - Second word.
 * @param {number} c - Third word.
 * @returns {number} Signed 32-bit result of `b ^ a ^ c`.
 */
function bs(b, a, c) {
  return b ^ a ^ c;
}
/**
 * Compute `a ^ (b | ~c)`. This is the round-4 auxiliary function I of
 * RFC 1321.
 *
 * @param {number} b - Word OR-ed with the complement of `c`.
 * @param {number} a - Word XOR-ed with that intermediate value.
 * @param {number} c - Word that is complemented.
 * @returns {number} Signed 32-bit result.
 */
function be(b, a, c) {
  return a ^ (b | ~c);
}
/**
 * One round-1 step: mix `h` with bk(g, b, i), the message word and the round
 * constant, rotate the sum left by `j` bits, then add `g`.
 *
 * @param {number} h - State word being updated.
 * @param {number} g - Second state word (also added after the rotate).
 * @param {number} b - Third state word.
 * @param {number} i - Fourth state word.
 * @param {number} f - Message word for this step.
 * @param {number} j - Left-rotate amount in bits.
 * @param {number} d - Additive round constant.
 * @returns {number} New value for the state word, as a signed 32-bit integer.
 */
function ba(h, g, b, i, f, j, d) {
  var mixed = bf(h, bf(bf(bk(g, b, i), f), d));
  return bf(a9(mixed, j), g);
}
/**
 * One round-2 step: mix `h` with bb(f, b, j), the message word and the round
 * constant, rotate the sum left by `k` bits, then add `f`.
 *
 * @param {number} h - State word being updated.
 * @param {number} f - Second state word (also added after the rotate).
 * @param {number} b - Third state word.
 * @param {number} j - Fourth state word.
 * @param {number} g - Message word for this step.
 * @param {number} k - Left-rotate amount in bits.
 * @param {number} d - Additive round constant.
 * @returns {number} New value for the state word, as a signed 32-bit integer.
 */
function bq(h, f, b, j, g, k, d) {
  var mixed = bf(h, bf(bf(bb(f, b, j), g), d));
  return bf(a9(mixed, k), f);
}
/**
 * One round-3 step: mix `g` with bs(b, f, h), the message word and the round
 * constant, rotate the sum left by `j` bits, then add `b`.
 *
 * @param {number} g - State word being updated.
 * @param {number} b - Second state word (also added after the rotate).
 * @param {number} f - Third state word.
 * @param {number} h - Fourth state word.
 * @param {number} d - Message word for this step.
 * @param {number} j - Left-rotate amount in bits.
 * @param {number} a - Additive round constant.
 * @returns {number} New value for the state word, as a signed 32-bit integer.
 */
function a8(g, b, f, h, d, j, a) {
  var mixed = bf(g, bf(bf(bs(b, f, h), d), a));
  return bf(a9(mixed, j), b);
}
/**
 * One round-4 step: mix `j` with be(f, h, b), the message word and the round
 * constant, rotate the sum left by `k` bits, then add `f`.
 *
 * @param {number} j - State word being updated.
 * @param {number} f - Second state word (also added after the rotate).
 * @param {number} h - Third state word.
 * @param {number} b - Fourth state word.
 * @param {number} g - Message word for this step.
 * @param {number} k - Left-rotate amount in bits.
 * @param {number} d - Additive round constant.
 * @returns {number} New value for the state word, as a signed 32-bit integer.
 */
function bh(j, f, h, b, g, k, d) {
  var mixed = bf(j, bf(bf(be(f, h, b), g), d));
  return bf(a9(mixed, k), f);
}
/**
 * Pack a byte string into padded little-endian 32-bit words.
 *
 * Each character code is OR-ed into its word at bit offset `(index % 4) * 8`,
 * a single 0x80 marker byte follows the data, and the array is sized to a
 * multiple of 16 words whose last two words hold the message length in bits
 * (low part `f << 3`, high part `f >>> 29`).
 *
 * The published listing had lost `g - g % 64`, both `<< m` shifts and the
 * `<< 3`; they are restored here. The array is also zero-filled instead of
 * being left sparse, which reads the same through the bitwise arithmetic
 * used downstream.
 *
 * @param {string} h - Input whose char codes are expected to be below 256
 *   (e.g. the output of bn).
 * @returns {number[]} Word array whose length is a multiple of 16.
 */
function bp(h) {
  var f = h.length;
  var g = f + 8;
  var b = (g - g % 64) / 64;
  var k = 16 * (b + 1);
  var d = new Array(k).fill(0);
  var l;
  var m;
  var j;

  for (j = 0; j < f; j++) {
    l = (j - j % 4) / 4;
    m = j % 4 * 8;
    d[l] = d[l] | (h.charCodeAt(j) << m);
  }

  // Append the 0x80 terminator right after the last data byte.
  l = (j - j % 4) / 4;
  m = j % 4 * 8;
  d[l] = d[l] | (128 << m);

  // Message length in bits, split across the final two words.
  d[k - 2] = f << 3;
  d[k - 1] = f >>> 29;
  return d;
}
/**
 * Format a 32-bit word as 8 hex digits, least-significant byte first.
 *
 * The published listing used typographic quotes, had `&` translated to a
 * word, and dropped the returned variable; all three are repaired.
 *
 * @param {number} d - Word to format; only its low 32 bits are used.
 * @returns {string} Eight lowercase hex digits in little-endian byte order.
 */
function bd(d) {
  var a = "";
  var b;
  var c;
  var f;
  for (f = 0; f <= 3; f++) {
    b = (d >>> (8 * f)) & 255;
    // Left-pad so every byte contributes exactly two digits.
    c = "0" + b.toString(16);
    a += c.substr(c.length - 2, 2);
  }
  return a;
}
/**
 * Normalise CRLF to LF and encode the string as UTF-8, returning one
 * character per encoded byte.
 *
 * Each UTF-16 code unit is encoded on its own (1, 2 or 3 bytes), so
 * surrogate pairs are not combined into 4-byte sequences.
 *
 * The published listing had a broken regex (`/ \ r \ n / g`), typographic
 * quotes, and a loop header truncated after `d`; they are restored here.
 *
 * @param {string} c - Text to encode.
 * @returns {string} Byte string with every char code in 0..255.
 */
function bn(c) {
  c = c.replace(/\r\n/g, "\n");
  var b = "";
  for (var d = 0; d < c.length; d++) {
    var a = c.charCodeAt(d);
    if (a < 128) {
      b += String.fromCharCode(a);
    } else if (a < 2048) {
      b += String.fromCharCode((a >> 6) | 192);
      b += String.fromCharCode((63 & a) | 128);
    } else {
      b += String.fromCharCode((a >> 12) | 224);
      b += String.fromCharCode(((a >> 6) & 63) | 128);
      b += String.fromCharCode((63 & a) | 128);
    }
  }
  return b;
}
然后,g兄弟编写了一个 encode 函数,用来对自定义的请求字符串进行加密,代码如下:
/**
 * Compute the lowercase hex digest used as the request `sign` value.
 *
 * The input is UTF-8 encoded with bn, packed into padded 16-word blocks with
 * bp, and each block is run through four rounds of 16 steps (ba, bq, a8, bh).
 * The initial state and the 64 additive constants are those of MD5
 * (RFC 1321), so the result is expected to equal the MD5 hex digest of the
 * input.
 *
 * The published listing had its loop header truncated after `bl`; the
 * condition `bl < J.length; bl += 16` is restored here.
 *
 * @param {string} bo - Text to sign.
 * @returns {string} 32 lowercase hex characters.
 */
function encode(bo) {
  var bl, bg, a7, bm, a6, br, a4, a3, G, J, V;
  // Per-round left-rotate amounts.
  var bt = 7, aZ = 12, Q = 17, X = 22;
  var bc = 5, a1 = 9, F = 14, a5 = 20;
  var L = 4, U = 11, z = 16, bj = 23;
  var bi = 6, a0 = 10, Y = 15, a2 = 21;

  J = bp(bn(bo));

  // Initial chaining state.
  br = 1732584193;
  a4 = 4023233417;
  a3 = 2562383102;
  G = 271733878;

  for (bl = 0; bl < J.length; bl += 16) {
    // Remember the state so it can be added back after the four rounds.
    bg = br;
    a7 = a4;
    bm = a3;
    a6 = G;

    // Round 1
    br = ba(br, a4, a3, G, J[bl + 0], bt, 3614090360);
    G = ba(G, br, a4, a3, J[bl + 1], aZ, 3905402710);
    a3 = ba(a3, G, br, a4, J[bl + 2], Q, 606105819);
    a4 = ba(a4, a3, G, br, J[bl + 3], X, 3250441966);
    br = ba(br, a4, a3, G, J[bl + 4], bt, 4118548399);
    G = ba(G, br, a4, a3, J[bl + 5], aZ, 1200080426);
    a3 = ba(a3, G, br, a4, J[bl + 6], Q, 2821735955);
    a4 = ba(a4, a3, G, br, J[bl + 7], X, 4249261313);
    br = ba(br, a4, a3, G, J[bl + 8], bt, 1770035416);
    G = ba(G, br, a4, a3, J[bl + 9], aZ, 2336552879);
    a3 = ba(a3, G, br, a4, J[bl + 10], Q, 4294925233);
    a4 = ba(a4, a3, G, br, J[bl + 11], X, 2304563134);
    br = ba(br, a4, a3, G, J[bl + 12], bt, 1804603682);
    G = ba(G, br, a4, a3, J[bl + 13], aZ, 4254626195);
    a3 = ba(a3, G, br, a4, J[bl + 14], Q, 2792965006);
    a4 = ba(a4, a3, G, br, J[bl + 15], X, 1236535329);

    // Round 2
    br = bq(br, a4, a3, G, J[bl + 1], bc, 4129170786);
    G = bq(G, br, a4, a3, J[bl + 6], a1, 3225465664);
    a3 = bq(a3, G, br, a4, J[bl + 11], F, 643717713);
    a4 = bq(a4, a3, G, br, J[bl + 0], a5, 3921069994);
    br = bq(br, a4, a3, G, J[bl + 5], bc, 3593408605);
    G = bq(G, br, a4, a3, J[bl + 10], a1, 38016083);
    a3 = bq(a3, G, br, a4, J[bl + 15], F, 3634488961);
    a4 = bq(a4, a3, G, br, J[bl + 4], a5, 3889429448);
    br = bq(br, a4, a3, G, J[bl + 9], bc, 568446438);
    G = bq(G, br, a4, a3, J[bl + 14], a1, 3275163606);
    a3 = bq(a3, G, br, a4, J[bl + 3], F, 4107603335);
    a4 = bq(a4, a3, G, br, J[bl + 8], a5, 1163531501);
    br = bq(br, a4, a3, G, J[bl + 13], bc, 2850285829);
    G = bq(G, br, a4, a3, J[bl + 2], a1, 4243563512);
    a3 = bq(a3, G, br, a4, J[bl + 7], F, 1735328473);
    a4 = bq(a4, a3, G, br, J[bl + 12], a5, 2368359562);

    // Round 3
    br = a8(br, a4, a3, G, J[bl + 5], L, 4294588738);
    G = a8(G, br, a4, a3, J[bl + 8], U, 2272392833);
    a3 = a8(a3, G, br, a4, J[bl + 11], z, 1839030562);
    a4 = a8(a4, a3, G, br, J[bl + 14], bj, 4259657740);
    br = a8(br, a4, a3, G, J[bl + 1], L, 2763975236);
    G = a8(G, br, a4, a3, J[bl + 4], U, 1272893353);
    a3 = a8(a3, G, br, a4, J[bl + 7], z, 4139469664);
    a4 = a8(a4, a3, G, br, J[bl + 10], bj, 3200236656);
    br = a8(br, a4, a3, G, J[bl + 13], L, 681279174);
    G = a8(G, br, a4, a3, J[bl + 0], U, 3936430074);
    a3 = a8(a3, G, br, a4, J[bl + 3], z, 3572445317);
    a4 = a8(a4, a3, G, br, J[bl + 6], bj, 76029189);
    br = a8(br, a4, a3, G, J[bl + 9], L, 3654602809);
    G = a8(G, br, a4, a3, J[bl + 12], U, 3873151461);
    a3 = a8(a3, G, br, a4, J[bl + 15], z, 530742520);
    a4 = a8(a4, a3, G, br, J[bl + 2], bj, 3299628645);

    // Round 4
    br = bh(br, a4, a3, G, J[bl + 0], bi, 4096336452);
    G = bh(G, br, a4, a3, J[bl + 7], a0, 1126891415);
    a3 = bh(a3, G, br, a4, J[bl + 14], Y, 2878612391);
    a4 = bh(a4, a3, G, br, J[bl + 5], a2, 4237533241);
    br = bh(br, a4, a3, G, J[bl + 12], bi, 1700485571);
    G = bh(G, br, a4, a3, J[bl + 3], a0, 2399980690);
    a3 = bh(a3, G, br, a4, J[bl + 10], Y, 4293915773);
    a4 = bh(a4, a3, G, br, J[bl + 1], a2, 2240044497);
    br = bh(br, a4, a3, G, J[bl + 8], bi, 1873313359);
    G = bh(G, br, a4, a3, J[bl + 15], a0, 4264355552);
    a3 = bh(a3, G, br, a4, J[bl + 6], Y, 2734768916);
    a4 = bh(a4, a3, G, br, J[bl + 13], a2, 1309151649);
    br = bh(br, a4, a3, G, J[bl + 4], bi, 4149444226);
    G = bh(G, br, a4, a3, J[bl + 11], a0, 3174756917);
    a3 = bh(a3, G, br, a4, J[bl + 2], Y, 718787259);
    a4 = bh(a4, a3, G, br, J[bl + 9], a2, 3951481745);

    // Fold this block's result into the running state.
    br = bf(br, bg);
    a4 = bf(a4, a7);
    a3 = bf(a3, bm);
    G = bf(G, a6);
  }

  V = bd(br) + bd(a4) + bd(a3) + bd(G);
  return V.toLowerCase();
}
制作爬虫
测试采集
采集测试结果如下:
作者: gbkhero
链接:
打开应用并阅读说明