vba xmlhttp 抓取网页( 我的代码:PublicSubRoupa() )
优采云 发布时间: 2022-03-16 01:07vba xmlhttp 抓取网页(
我的代码:PublicSubRoupa()
)
Excel VBA 跨多个网页抓取
excelvbaweb-scraping
Excel VBA 跨多个网页数据抓取,效果很好。我现在的“问题”是我需要为多个网页运行代码,因为我正在抓取的 网站 有一个分页脚本。一个页面有48条记录,但大多数情况下,页面有200多条记录,但又分为3/4页。我的代码:Public Sub Roupa()Dim data As Object, i As Long, html As HTMLDocument, r As Long, c As Long, item As Object, div As Object
所以我有以下代码从 网站 中删除数据,它工作正常。
我现在的“问题”是我需要为多个页面运行代码,因为我正在抓取的 网站 有一个分页脚本。
一个页面有48条记录,但大多数情况下,页面有200多条记录,但又分为3/4页。
我的代码:
Public Sub Roupa()
Dim data As Object, i As Long, html As HTMLDocument, r As Long, c As Long, item As Object, div As Object
Set html = New HTMLDocument ' Tools > References > Microsoft HTML Object Library
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=100", False
.send
html.body.innerHTML = .responseText
End With
Set data = html.getElementsByClassName("w-product__content")
For Each item In data
r = r + 1: c = 1
For Each div In item.getElementsByTagName("div")
With ThisWorkbook.Worksheets("Roupa")
.Cells(r, c) = div.innerText
End With
c = c + 1
Next
Next
Sheets("Roupa").Range("A:A,C:C,F:F,G:G,H:H,I:I").EntireColumn.Delete
End Sub
Public子组a()
Dim数据作为对象,i作为长,html作为HTMLDocument,r作为长,c作为长,item作为对象,div作为对象
设置html=New HTMLDocument'Tools>References>Microsoft html对象库
使用CreateObject(“MSXML2.XMLHTTP”)
.打开“获取”https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=100”“错
邮寄
html.body.innerHTML=.responseText
以
Set data=html.getElementsByClassName(“w-product\u内容”)
对于数据中的每个项目
r=r+1:c=1
对于item.getElementsByTagName(“div”)中的每个div
使用此工作簿。工作表(“组A”)
.Cells(r,c)=div.innerText
以
c=c+1
下一个
下一个
表格(“A”)。范围(“A:A,C:C,F:F,G:G,H:H,I:I”)。全部删除
端接头
更新
我尝试过使用
的
在 n=1 到 2 之前添加这个
,但我需要知道确切的页数,这样就没有什么帮助了。通过将结果计数除以每页的结果来计算出有多少页。然后执行一个循环,将适当的页码连接到url上</p>
Option Explicit
Public Sub Roupa()
Dim data As Object, i As Long, html As HTMLDocument, r As Long, c As Long, item As Object, div As Object
Set html = New HTMLDocument ' Tools > References > Microsoft HTML Object Library
Const RESULTS_PER_PAGE As Long = 48
Const START_URL As String = "https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=" & RESULTS_PER_PAGE & "&page=1"
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", START_URL, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
html.body.innerHTML = .responseText
Dim numPages As Long, numResults As Long, arr() As String
arr = Split(html.querySelector(".w-filters__element").innerText, Chr$(32))
numResults = arr(UBound(arr))
numPages = 1
If numResults > RESULTS_PER_PAGE Then
numPages = Application.RoundUp(numResults / RESULTS_PER_PAGE, 0)
End If
For i = 1 To numPages
If i > 1 Then
.Open "GET", Replace$("https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=" & RESULTS_PER_PAGE & "&page=1", "page=1", "page=" & i), False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
html.body.innerHTML = .responseText
End If
Set data = html.getElementsByClassName("w-product__content")
For Each item In data
r = r + 1: c = 1
For Each div In item.getElementsByTagName("div")
With ThisWorkbook.Worksheets("Roupa")
.Cells(r, c) = div.innerText
End With
c = c + 1
Next
Next
Next
End With
Sheets("Roupa").Range("A:A,C:C,F:F,G:G,H:H,I:I").EntireColumn.Delete
End Sub
</p>
如果您尝试在 URL 中更改它-
每页=100
说
每页=100000
?我试过了,实际上页面只加载了48条记录
每页=100
@QHarr 已经提供了我的尝试
w-filters\uu元素
导出页数的好方法。但是,另一种方法(我通常使用的)是循环(增加页数)直到分页之后
列表元素的
在内部文本中查找下一个页码(在这种情况下,
分页文本中心
或 Div 类
w-pagination-block
) 该死的你是个天才! !你有好的课程吗?我想了解更多! !和
numPages = html.querySelectorAll("[data-page]").Length