Excel VBA web scraping for data table

2019-01-27 11:39发布

问题:

I am trying to get data from Roster Resource, here's an example of a webpage (https://www.rosterresource.com/mlb-arizona-diamondbacks). At the very minimum, I want to get the "Projected "Go-to" Starting Lineup" and import that data into my spreadsheet. I would then do this for every MLB team from Roster Resource to create a sheet that has every team and the projected lineup for each team.

I have tried some methods of "getElementById" and "getElementsByClassName", but I'm having difficulty getting to the data I want since this seems to be just one very large table on the webpage.

Any insight to get me on the right direction of getting the data would be very helpful.

回答1:

Matt: You can use PowerQuery (in either Excel or PowerBI) to do this...even if the data isn't stored in an HTML table (which is the case here). There is a very good tutorial at https://datachant.com/2017/03/30/web-scraping-power-bi-excel-power-query/

I'm currently in the middle of my own web scraping challenge, but if you decide to use PowerQuery and get stuck, yell out and I'll see if I can assist further.



回答2:

If you navigate the webpage https://www.rosterresource.com/mlb-arizona-diamondbacks and choose Inspect element from context menu on the table, you will see in browser developer tools that the whole table is located within a frame:

<iframe id="pageswitcher-content" frameborder="0" marginheight="0" marginwidth="0" src="https://docs.google.com/spreadsheets/d/e/2PACX-1vSe6YBd7UW_ijhVHdRsM132Z3aUXUIzGuHcoqqdsr_nUXIYHbvRDFY0XCwGVndXJnWRaWVYhbeDbo5W/pubhtml/sheet?headers=false&amp;gid=1569103012" style="display: block; width: 100%; height: 100%;"></iframe>

So actually you need to retrieve the data from that Google Spreadsheet document. That could be done with XHR and Regex, as shown in the below code:

Option Explicit

Sub Test()

    Dim sContent As String
    Dim i As Long
    Dim j As Long
    Dim k As Long
    Dim aTables()
    Dim aHeader() As String
    Dim aRows() As String

    ' Retrieve HTML content via XHR
    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", "https://www.rosterresource.com/mlb-arizona-diamondbacks", False
        .Send
        sContent = .ResponseText
    End With
    ' Cut all before iframe URL
    sContent = Split(sContent, "<iframe src=""", 2)(1)
    ' Cut all after ? sign within URL
    sContent = Split(sContent, "?", 2)(0)
    ' Download google spreadsheet by extracted URL
    ' e. g. https://docs.google.com/spreadsheets/d/e/2PACX-1vQngsjnOpqkD8FQIOLn4cFayZTe4dl5VJZLNjMzji2Iq0dVXan7nj20Pq6oKnVS_HFla9e5GUtCyYl_/pubhtml
    ' e. g. https://docs.google.com/spreadsheets/d/e/2PACX-1vSe6YBd7UW_ijhVHdRsM132Z3aUXUIzGuHcoqqdsr_nUXIYHbvRDFY0XCwGVndXJnWRaWVYhbeDbo5W/pubhtml
    ' Retrieve HTML content via XHR
    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", sContent, False
        .Send
        sContent = .ResponseText
    End With
    ' Parse with RegEx
    With CreateObject("VBScript.RegExp")
        .Global = True
        .MultiLine = True
        .IgnoreCase = True
        ' Process all tables within iframe content
        .Pattern = "<table\b[\s\S]*?>([\s\S]*?)</table>"
        With .Execute(sContent)
            ReDim aTables(0 To .Count - 1)
            For i = 0 To .Count - 1
                aTables(i) = .Item(i).SubMatches(0)
            Next
        End With
        For k = 0 To UBound(aTables)
            ' Minor HTML simplification
            sContent = aTables(k)
            ' Remove all tags except table formatting
            .Pattern = "<(?!/td|/tr|/th|(?:td|tr|th)\b)[^>]*>|\r|\n|\t"
            sContent = .Replace(sContent, "")
            ' Remove tags attributes
            .Pattern = "<(\w+)\b[^>]+>"
            sContent = .Replace(sContent, "<$1>")
            ' Replace th with td
            .Pattern = "<(/?)th>"
            sContent = .Replace(sContent, "<$1td>")
            ' Replace HTML entities &name; &#number; with chars
            .Pattern = "&(?:\w+|#\d+);"
            .Global = False
            Do
                With .Execute(sContent)
                    If .Count = 0 Then Exit Do
                    sContent = Replace(sContent, .Item(0), DecodeHTMLEntities(.Item(0)))
                End With
            Loop
            .Global = True
           ' Extract rows
            .Pattern = "<tr>((?:<td>.*?</td>)+)</tr>"
            With .Execute(sContent)
                ReDim aRows(0 To .Count - 1, 0)
                For i = 0 To .Count - 1
                    aRows(i, 0) = .Item(i).SubMatches(0)
                Next
            End With
            ' Extract cells
            .Pattern = "<td>(.*?)</td>"
            For i = 0 To UBound(aRows, 1)
                With .Execute(aRows(i, 0))
                    For j = 0 To .Count - 1
                        If UBound(aRows, 2) < j Then ReDim Preserve aRows(UBound(aRows, 1), j)
                        aRows(i, j) = Trim(.Item(j).SubMatches(0))
                        DoEvents
                    Next
                End With
            Next
            aTables(k) = aRows
        Next
    End With
    ' Output
    With ThisWorkbook
        ' Remove all existing worksheets
        Application.DisplayAlerts = False
        .Sheets.Add , .Sheets(.Sheets.Count)
        Do While .Sheets.Count > 1
            .Sheets(1).Delete
        Loop
        Application.DisplayAlerts = True
        ' Output each table to separate worksheet
        For k = 0 To UBound(aTables)
            If .Sheets.Count < (k + 1) Then .Sheets.Add , .Sheets(.Sheets.Count)
            With .Sheets(k + 1)
                .Cells.Delete
                Output2DArray .Cells(1, 1), aTables(k)
                .Columns.AutoFit
            End With
        Next
    End With

End Sub

Function DecodeHTMLEntities(sText As String) As String

    Static oHtmlfile As Object
    Static oDiv As Object

    If oHtmlfile Is Nothing Then
        Set oHtmlfile = CreateObject("htmlfile")
        oHtmlfile.Open
        Set oDiv = oHtmlfile.createElement("div")
    End If
    oDiv.innerHTML = sText
    DecodeHTMLEntities = oDiv.innerText

End Function

Sub Output2DArray(oDstRng As Range, aCells As Variant)

    With oDstRng
        .Parent.Select
        With .Resize( _
                UBound(aCells, 1) - LBound(aCells, 1) + 1, _
                UBound(aCells, 2) - LBound(aCells, 2) + 1)
            .NumberFormat = "@"
            .Value = aCells
        End With
    End With

End Sub

Generally RegEx's aren't recommended for HTML parsing, so there is disclaimer. Data being processed in this case is quite simple that is why it is parsed with RegEx. About RegEx: introduction (especially syntax), introduction JS, VB flavor. Simplification makes HTML code suitable for parsing in some degree. BTW there is one more answer using the same approach.