vba xmlhttp 抓取网页( 我的代码:PublicSubRoupa() )

优采云 发布时间: 2022-03-16 01:07

  vba xmlhttp 抓取网页(

我的代码:PublicSubRoupa()

)

  Excel VBA 跨多个网页抓取

  excelvbaweb-scraping

  Excel VBA 跨多个网页数据抓取,效果很好。我现在的“问题”是我需要为多个网页运行代码,因为我正在抓取的 网站 有一个分页脚本。一个页面有48条记录,但大多数情况下,页面有200多条记录,但又分为3/4页。我的代码:Public Sub Roupa()Dim data As Object, i As Long, html As HTMLDocument, r As Long, c As Long, item As Object, div As Object

  所以我有以下代码从 网站 中删除数据,它工作正常。

  我现在的“问题”是我需要为多个页面运行代码,因为我正在抓取的 网站 有一个分页脚本。

  一个页面有48条记录,但大多数情况下,页面有200多条记录,但又分为3/4页。

  我的代码:

  Public Sub Roupa()

Dim data As Object, i As Long, html As HTMLDocument, r As Long, c As Long, item As Object, div As Object

Set html = New HTMLDocument ' Tools > References > Microsoft HTML Object Library

With CreateObject("MSXML2.XMLHTTP")

.Open "GET", "https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=100", False

.send

html.body.innerHTML = .responseText

End With

Set data = html.getElementsByClassName("w-product__content")

For Each item In data

r = r + 1: c = 1

For Each div In item.getElementsByTagName("div")

With ThisWorkbook.Worksheets("Roupa")

.Cells(r, c) = div.innerText

End With

c = c + 1

Next

Next

Sheets("Roupa").Range("A:A,C:C,F:F,G:G,H:H,I:I").EntireColumn.Delete

End Sub

  Public子组a()

Dim数据作为对象,i作为长,html作为HTMLDocument,r作为长,c作为长,item作为对象,div作为对象

设置html=New HTMLDocument'Tools>References>Microsoft html对象库

使用CreateObject(“MSXML2.XMLHTTP”)

.打开“获取”https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=100”“错

邮寄

html.body.innerHTML=.responseText

Set data=html.getElementsByClassName(“w-product\u内容”)

对于数据中的每个项目

r=r+1:c=1

对于item.getElementsByTagName(“div”)中的每个div

使用此工作簿。工作表(“组A”)

.Cells(r,c)=div.innerText

c=c+1

下一个

下一个

表格(“A”)。范围(“A:A,C:C,F:F,G:G,H:H,I:I”)。全部删除

端接头

  更新

  我尝试过使用

  的

  在 n=1 到 2 之前添加这个

  ,但我需要知道确切的页数,这样就没有什么帮助了。通过将结果计数除以每页的结果来计算出有多少页。然后执行一个循环,将适当的页码连接到url上</p>

Option Explicit

Public Sub Roupa()

Dim data As Object, i As Long, html As HTMLDocument, r As Long, c As Long, item As Object, div As Object

Set html = New HTMLDocument ' Tools > References > Microsoft HTML Object Library

Const RESULTS_PER_PAGE As Long = 48

Const START_URL As String = "https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=" & RESULTS_PER_PAGE & "&page=1"

With CreateObject("MSXML2.XMLHTTP")

.Open "GET", START_URL, False

.setRequestHeader "User-Agent", "Mozilla/5.0"

.send

html.body.innerHTML = .responseText

Dim numPages As Long, numResults As Long, arr() As String

arr = Split(html.querySelector(".w-filters__element").innerText, Chr$(32))

numResults = arr(UBound(arr))

numPages = 1

If numResults > RESULTS_PER_PAGE Then

numPages = Application.RoundUp(numResults / RESULTS_PER_PAGE, 0)

End If

For i = 1 To numPages

If i > 1 Then

.Open "GET", Replace$("https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=" & RESULTS_PER_PAGE & "&page=1", "page=1", "page=" & i), False

.setRequestHeader "User-Agent", "Mozilla/5.0"

.send

html.body.innerHTML = .responseText

End If

Set data = html.getElementsByClassName("w-product__content")

For Each item In data

r = r + 1: c = 1

For Each div In item.getElementsByTagName("div")

With ThisWorkbook.Worksheets("Roupa")

.Cells(r, c) = div.innerText

End With

c = c + 1

Next

Next

Next

End With

Sheets("Roupa").Range("A:A,C:C,F:F,G:G,H:H,I:I").EntireColumn.Delete

End Sub

</p>

  如果您尝试在 URL 中更改它-

  每页=100

  说

  每页=100000

  ?我试过了,实际上页面只加载了48条记录

  每页=100

  @QHarr 已经提供了我的尝试

  w-filters\uu元素

  导出页数的好方法。但是,另一种方法(我通常使用的)是循环(增加页数)直到分页之后

  列表元素的

  在内部文本中查找下一个页码(在这种情况下,

  分页文本中心

  或 Div 类

  w-pagination-block

  ) 该死的你是个天才! !你有好的课程吗?我想了解更多! !和

  numPages = html.querySelectorAll("[data-page]").Length

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线