XHTML Website Scraping Guidance

2019-07-05 22:08发布

I'm very new to VBA and HTML/XHTML, but through online research and help from other wonderful members on here I've managed to write a code to pull the data I want. I had a hard time identifying the IDs of the elements I want since it's in XHTML, so I think that's where I've botched it the most.

The website: http://www.usbanklocations.com/banks.php?q=&ct=&ml=30&lc=

Here is what I want the code to do: Pull Bank Name, Address, Phone Number, Total Deposits and Total Assets -- GIVEN the bank name and city I provide in my excel sheet.

Here is my code:

Private Declare Sub Sleep Lib "kernel32.dll" (ByVal dwMilliseconds As Long)
Sub CommunityBanks()
    Dim IE As Object, TableResults As Object, webRow As Object, BankName As Variant, page As Long, pageTotal As Long, r As Long
    Dim beginTime As Date, i As Long, myvalue As Variant

Set IE = CreateObject("internetexplorer.application")
IE.navigate "http://www.usbanklocations.com/banks.php?name=" & Range("A2").Value & "+Bank&ml=30&lc=" & Range("B2").Value & "%2C+TX"
IE.Visible = True

Do While IE.Busy Or IE.readystate <> 4   '4 = READYSTATE_COMPLETE
    DoEvents
Loop

'input bank name into form
'myvalue = InputBox("Enter City. Press okay to begin search", "Bank Search")
'Range("F3").Value = myvalue
'IE.document.getelementbyid("MainContent_txtCity").Value = "LegacyTexas"
'click find button
'IE.document.getelementbyid("MainContent_btn").Click
'Sleep 5 * 1000
IE.document.getelementbytagname("table").getelementsbyclassname("btn").Click
Sleep 5 * 1000

'total pages
pageTotal = IE.document.getelementbyid("lsortby").innertext
page = 0

Do Until page = pageTotal
    DoEvents
    page = IE.document.getelementbyclassname("lsortby").innertext
    With IE.document.getelementbyid("main")
        For r = 1 To .Rows.Length - 1
            If Not IsArray(BankName) Then
                ReDim BankName(7, 0) As Variant
            Else
                ReDim Preserve BankName(7, UBound(BankName, 2) + 1) As Variant
            End If

            BankName(0, UBound(BankName, 2)) = .Rows(r).Cells(0).innertext
        Next r
    End With

    If page < pageTotal Then
        IE.document.getelementbyclassname("panelpn").Click
        beginTime = Now
        Application.Wait (Now + TimeValue("00:00:05"))
    End If
Loop

For r = 0 To UBound(BankName, 2)
    IE.navigate "http://www.usbanklocations.com/" & BankName(0, r)
    Do While IE.Busy Or IE.readystate <> 4   '4 = READYSTATE_COMPLETE
        DoEvents
    Loop
    'wait 5 sec. for screen refresh
    Sleep 5 * 1000

    With IE.document.getelementbytagname("table")
        For i = 0 To .Rows.Length - 1
            DoEvents
            Select Case .Rows(i).Cells(0).innertext
            Case "Name:"
                BankName(1, r) = .Rows(i).Cells(1).innertext
            Case "Location:"
                BankName(2, r) = .Rows(i).Cells(1).innertext
            Case "Phone:"
                BankName(3, r) = .Rows(i).Cells(1).innertext
            Case "Branch Deposit:"
                BankName(4, r) = Replace(Replace(.Rows(i).Cells(1).innertext, ",", ""), "$", "")
            Case "Total Assets:"
                BankName(5, r) = Replace(Replace(.Rows(i).Cells(1).innertext, ",", ""), "$", "")
            End Select
        Next i
    End With
Next r


IE.Quit
Set IE = Nothing

'post result on Excel cell
Worksheets(1).Range("A9").Resize(UBound(BankName, 2) + 1, UBound(BankName, 1) + 1).Value = Application.Transpose(BankName)
End Sub

Thank you in advance! I would greatly appreciate any help.

1条回答
小情绪 Triste *
2楼-- · 2019-07-05 22:21

Consider the below example which uses XHR instead of IE and split-based HTML content parsing:

Option Explicit

Sub Test_usbanklocations()

    Dim oSource, oDestination, y, oSrcRow, sName, sCity, sDist, sUrl0, sUrl1, sUrl2, lPage, sResp1, sResp2, i, a1, a2, a3, a4, a5

    Set oSource = Sheets(1)
    Set oDestination = Sheets(2)
    oDestination.Cells.Delete
    DataOutput oDestination, 1, Array("Name", "Location", "Phone", "Total Assets", "Total Deposits")
    y = 2

    For Each oSrcRow In oSource.UsedRange.Rows
        sName = oSrcRow.Cells(1, 1).Value
        sCity = oSrcRow.Cells(1, 2).Value
        sDist = oSrcRow.Cells(1, 3).Value
        sUrl0 = "http://www.usbanklocations.com/banks.php?q=" & EncodeUriComponent(sName) & "&lc=" & EncodeUriComponent(sCity) & "&ml=" & sDist
        sUrl1 = sUrl0
        lPage = 1
        Do
            sResp1 = GetXHR(sUrl1)
            If InStr(sResp1, "We can not find the address you provided. Please check.") > 0 Then Exit Do
            a1 = Split(sResp1, "<div class=""pl")
            For i = 1 To UBound(a1)
                a2 = Split(a1(i), "</div>", 3)
                a3 = Split(a2(1), "<a href=""", 2)
                a4 = Split(a3(1), """>", 2)
                sUrl2 = "http://www.usbanklocations.com" & a4(0)
                sResp2 = GetXHR(sUrl2)
                a5 = Array( _
                    GetFragment(sResp2, "<b>Name:</b></td><td>", "</td>"), _
                    Replace(GetFragment(sResp2, "<b>Location:</b></td><td>", "</td>"), "View Other Branches", ""), _
                    GetFragment(sResp2, "<b>Phone:</b></td>", "</td>"), _
                    GetFragment(sResp2, "<b>Total Assets:</b></td><td>", "</td>"), _
                    GetFragment(sResp2, "<b>Total Deposits:</b></td><td>", "</td>") _
                )
                DataOutput oDestination, y, a5
                y = y + 1
                DoEvents
            Next
            If InStr(sResp1, "Next Page &gt;") = 0 Then Exit Do
            lPage = lPage + 1
            sUrl1 = sUrl0 & "&ps=" & lPage
            DoEvents
        Loop
    Next

    MsgBox "Completed"

End Sub

Function GetXHR(sUrl)

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", sUrl, False
        .Send
        GetXHR = .ResponseText
    End With

End Function

Sub DataOutput(oSht, y, aValues)

    With oSht.Cells(y, 1).Resize(1, UBound(aValues) + 1)
        .NumberFormat = "@"
        .Value = aValues
    End With

End Sub

Function GetFragment(sText, sPatt1, sPatt2)

    Dim a1, a2

    a1 = Split(sText, sPatt1, 2)
    If UBound(a1) <> 1 Then Exit Function
    a2 = Split(a1(1), sPatt2, 2)
    If UBound(a2) <> 1 Then Exit Function
    GetFragment = GetInnerText(a2(0))

End Function

Function EncodeUriComponent(sText)

    Static objHtmlfile As Object

    If objHtmlfile Is Nothing Then
        Set objHtmlfile = CreateObject("htmlfile")
        objHtmlfile.parentWindow.execScript "function encode(s) {return encodeURIComponent(s)}", "jscript"
    End If
    EncodeUriComponent = objHtmlfile.parentWindow.encode(sText)

End Function

Function GetInnerText(sText)

    With CreateObject("htmlfile")
        .Write ("<body>" & sText & "</body>")
        GetInnerText = .DocumentElement.Document.GetElementsByTagName("body")(0).InnerText
    End With

End Function

As an example, the first worksheet contains data to search (Bank name, Location and Distance to refine by):

source

Then result on the second worksheet is as follows:

result

查看更多
登录 后发表回答