Web Scraping by Elements

2019-09-24 12:38发布

问题:

I'm trying to scrape the data from a web-page. It won't work for a website for which all the format is same like as class, tag everything. I am getting an error is "Subscript out of range" and it's highlighting on "ReDim results(1 To rowCount, 1 To numColumns)" code.

I got an answer on page: Web Scraping by TagName the code works fine for https://www.neighborhoodselfstorage.net/self-storage-ocean-city-md-88769

Now I am trying to use the same code for: https://www.stormore.net/self-storage-seattle-wa-101616#utm_source=GoogleLocal&utm_medium=WRLocal&utm_campaign=101616

Please anybody help to solve this problem.

Option Explicit  
Public Sub GetInfo()
Dim ws As Worksheet, html As HTMLDocument, s As String
Const URL As String = "https://www.stormore.net/self-storage-seattle-wa-101616#utm_source=GoogleLocal&utm_medium=WRLocal&utm_campaign=101616"

Set ws = ThisWorkbook.Worksheets("Sheet1")
Set html = New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
    .Open "GET", URL, False
    .setRequestHeader "User-Agent", "Mozilla/5.0"
    .send
    s = .responseText
    html.body.innerHTML = s

    Dim headers(), results(), listings As Object, amenities As String

    headers = Array("Size", "Description", "Amenities", "Offer1", "Offer2", "RateType", "Price")
    Set listings = html.querySelectorAll(".main li[class]")

    Dim rowCount As Long, numColumns As Long, r As Long, c As Long
    Dim icons As Object, icon As Long, amenitiesInfo(), i As Long, item As Long

    rowCount = listings.Length
    numColumns = UBound(headers) + 1

    ReDim results(1 To rowCount, 1 To numColumns)
    Dim html2 As HTMLDocument
    Set html2 = New HTMLDocument
    For item = 0 To listings.Length - 1
        r = r + 1
        html2.body.innerHTML = listings.item(item).innerHTML
        'size,description, amenities,specials offer1 offer2, rate type, price

        results(r, 1) = Trim$(html2.querySelector(".size").innerText)
        results(r, 2) = Trim$(html.querySelector(".description").innerText)
        Set icons = html2.querySelectorAll("i[title]")

        ReDim amenitiesInfo(0 To icons.Length - 1)

        For icon = 0 To icons.Length - 1
            amenitiesInfo(icon) = icons.item(icon).getAttribute("title")
        Next

        amenities = Join$(amenitiesInfo, ", ")

        results(r, 3) = amenities
        results(r, 4) = html2.querySelector(".offer1").innerText
        results(r, 5) = html2.querySelector(".offer2").innerText
        results(r, 6) = html2.querySelector(".rate-label").innerText
        results(r, 7) = html2.querySelector(".price").innerText
    Next

    ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
    ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub

回答1:

I think you want something like the following.

Initial error:

Your initial error in part, I think, is due to the url not returning the html you see when using the same url in the browser. The content I saw did not contain these listings in the response, hence row count was 0 ; therefore, your error subscript out of range error on this line: ReDim results(1 To rowCount, 1 To numColumns)

So, url changed to:

https://www.stormore.net/self-storage-seattle-wa-101616

Next:

Inspecting the html, to find out how to generate the listings rows, we notice that listings are represented cleanly by .main li.pure-g. An additional class needs adding to the li to filter out unwanted info. We want to loop only rows containing the info of interest.

Set listings = html.querySelectorAll(".main li.pure-g")

Finally:

Whilst inspecting the html we notice that not all rows have all items of interest e.g. offer1 and offer2, so we wrap the attempts to access some items within an On Error Resume Next, On Error GoTo 0 to mask the error and output "" in that column of the output.


VBA:

Option Explicit

Public Sub GetInfo()
    Dim ws As Worksheet, html As HTMLDocument, s As String
    Const URL As String = "https://www.stormore.net/self-storage-seattle-wa-101616"

    Set ws = ThisWorkbook.Worksheets("Sheet1")
    Set html = New HTMLDocument
    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", URL, False
        .setRequestHeader "User-Agent", "Mozilla/5.0"
        .send
        s = .responseText

        html.body.innerHTML = s

        Dim headers(), results(), listings As Object, amenities As String

        headers = Array("Size", "Description", "Amenities", "Offer1", "Offer2", "RateType", "Price")
        Set listings = html.querySelectorAll(".main li.pure-g")

        Dim rowCount As Long, numColumns As Long, r As Long, c As Long
        Dim icons As Object, icon As Long, amenitiesInfo(), i As Long, item As Long

        rowCount = listings.Length
        numColumns = UBound(headers) + 1

        ReDim results(1 To rowCount, 1 To numColumns)
        Dim html2 As HTMLDocument
        Set html2 = New HTMLDocument
        For item = 0 To listings.Length - 1
            r = r + 1
            html2.body.innerHTML = listings.item(item).innerHTML
            'size,description, amenities,specials offer1 offer2, rate type, price

            results(r, 1) = Trim$(html2.querySelector(".size").innerText)
            results(r, 2) = Trim$(html2.querySelector(".description").innerText)
            On Error Resume Next
            Set icons = html2.querySelectorAll("i[title]")
            ReDim amenitiesInfo(0 To icons.Length - 1)

            For icon = 0 To icons.Length - 1
                amenitiesInfo(icon) = icons.item(icon).getAttribute("title")
            Next

            amenities = Join$(amenitiesInfo, ", ")

            results(r, 3) = amenities

            results(r, 4) = html2.querySelector(".offer1").innerText
            results(r, 5) = html2.querySelector(".offer2").innerText
            On Error GoTo 0
            results(r, 6) = html2.querySelector(".rate-label").innerText
            results(r, 7) = html2.querySelector(".price").innerText
        Next

        ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
        ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
    End With
End Sub