在VBA中解析HTML内容

Tde*_*dev 12 parsing vba web-crawler excel-vba html-parsing

我有一个与HTML解析有关的问题.我有一个包含一些产品的网站,我想将页面内的文字记录到我当前的电子表格中.这个电子表格很大但在第3列中包含ItemNbr,我希望第14列和第1行中的文本对应于一个产品(item).

我的想法是在标签后面的Innertext里面的网页上获取"材料".身份证号码从一页变为另一页(有时).

这是网站的结构:

<div style="position:relative;">
    <div></div>
    <table id="list-table" width="100%" tabindex="1" cellspacing="0" cellpadding="0" border="0" role="grid" aria-multiselectable="false" aria-labelledby="gbox_list-table" class="ui-jqgrid-btable" style="width: 930px;">
        <tbody>
            <tr class="jqgfirstrow" role="row" style="height:auto">
                <td ...</td>
                <td ...</td>
            </tr>
            <tr role="row" id="1" tabindex="-1" class="ui-widget-content jqgrow ui-row-ltr">
                <td ...</td>
                <td ...</td>
            </tr>
            <tr role="row" id="2" tabindex="-1" class="ui-widget-content jqgrow ui-row-ltr">
                <td ...</td>
                <td ...</td>
            </tr>
            <tr role="row" id="3" tabindex="-1" class="ui-widget-content jqgrow ui-row-ltr">
                <td ...</td>
                <td ...</td>
            </tr>
            <tr role="row" id="4" tabindex="-1" class="ui-widget-content jqgrow ui-row-ltr">
                <td ...</td>
                <td ...</td>
            </tr>
            <tr role="row" id="5" tabindex="-1" class="ui-widget-content jqgrow ui-row-ltr">
                <td ...</td>
                <td ...</td>
            </tr>
            <tr role="row" id="6" tabindex="-1" class="ui-widget-content jqgrow ui-row-ltr">
                <td ...</td>
                <td ...</td>
            </tr>
            <tr role="row" id="7" tabindex="-1" class="ui-widget-content jqgrow ui-row-ltr">
                <td role="gridcell" style="padding-left:10px" title="Material" aria-describedby="list-table_">Material</td>
                <td role="gridcell" style="" title="600D polyester." aria-describedby="list-table_">600D polyester.</td>
            </tr>           
            <tr ...>
            </tr>
        </tbody>
    </table> </div>
Run Code Online (Sandbox Code Playgroud)

我希望得到"600D涤纶".

我的(不工作)代码片段是这样的:

Sub ParseMaterial()

    Dim Cell As Integer
    Dim ItemNbr As String

    Dim AElement As Object
    Dim AElements As IHTMLElementCollection
Dim IE As MSXML2.XMLHTTP60
Set IE = New MSXML2.XMLHTTP60

Dim HTMLDoc As MSHTML.HTMLDocument
Dim HTMLBody As MSHTML.HTMLBody

Set HTMLDoc = New MSHTML.HTMLDocument
Set HTMLBody = HTMLDoc.body

For Cell = 1 To 5                            'I iterate through the file row by row

    ItemNbr = Cells(Cell, 3).Value           'ItemNbr isin the 3rd Column of my spreadsheet

    IE.Open "GET", "http://www.example.com/?item=" & ItemNbr, False
    IE.send

    While IE.ReadyState <> 4
        DoEvents
    Wend

    HTMLBody.innerHTML = IE.responseText

    Set AElements = HTMLDoc.getElementById("list-table").getElementsByTagName("tr")
    For Each AElement In AElements
        If AElement.Title = "Material" Then
            Cells(Cell, 14) = AElement.nextNode.value     'I write the material in the 14th column
        End If
    Next AElement

        Application.Wait (Now + TimeValue("0:00:2"))

Next Cell
Run Code Online (Sandbox Code Playgroud)

谢谢你的帮助 !

IAm*_*ged 9

只有几件事情可以帮助您朝着正确的方向前进:

  • 清理一下:删除readystate属性测试循环.readystate属性返回的值在此上下文中永远不会更改 - 代码将在发送指令后暂停,仅在收到服务器响应后恢复,或者未能执行此操作.将相应地设置readystate属性,代码将继续执行.您仍然应该测试就绪状态,但循环是不必要的

  • 定位正确的HTML元素:您正在搜索tr元素 - 而您在代码中如何使用这些元素的逻辑实际上看起来指向td元素

  • 确保这些属性实际可用于您正在使用它们的对象:为了帮助您,请尝试将所有变量声明为特定对象而不是通用对象.这将激活intellisense.如果您很难找到相关库中定义的对象的实际名称,请将其声明为通用对象,运行代码,然后检查对象的类型 - 通过打印typename(your_object)例如,到调试窗口.这应该会让你走上正轨

我还在下面提供了一些可能有用的代码.如果你仍然无法让这个工作,你可以分享你的网址 - 请这样做.

Sub getInfoWeb()

    Dim cell As Integer
    Dim xhr As MSXML2.XMLHTTP60
    Dim doc As MSHTML.HTMLDocument
    Dim table As MSHTML.HTMLTable
    Dim tableCells As MSHTML.IHTMLElementCollection

    Set xhr = New MSXML2.XMLHTTP60

    For cell = 1 To 5

        ItemNbr = Cells(cell, 3).Value

        With xhr

            .Open "GET", "http://www.example.com/?item=" & ItemNbr, False
            .send

            If .readyState = 4 And .Status = 200 Then
                Set doc = New MSHTML.HTMLDocument
                doc.body.innerHTML = .responseText
            Else
                MsgBox "Error" & vbNewLine & "Ready state: " & .readyState & _
                vbNewLine & "HTTP request status: " & .Status
            End If

        End With

        Set table = doc.getElementById("list-table")
        Set tableCells = table.getElementsByTagName("td")

        For Each tableCell In tableCells
            If tableCell.getAttribute("title") = "Material" Then
                Cells(cell, 14).Value = tableCell.NextSibling.innerHTML
            End If
        Next tableCell

    Next cell

End Sub
Run Code Online (Sandbox Code Playgroud)

编辑:作为您在下面评论中提供的进一步信息的后续行动 - 以及我添加的附加评论

'Determine your product number
    'Open an xhr for your source url, and retrieve the product number from there - search for the tag which
    'text include the "productnummer:" substring, and extract the product number from the outerstring
    'OR
    'if the product number consistently consists of the fctkeywords you are entering in your source url
    'with two "0" appended - just build the product number like that
'Open an new xhr for this url "http://www.pfconcept.com/cgi-bin/wspd_pcdb_cgi.sh/y/y2productspec-ajax.p?itemc=" & product_number & "&_search=false&rows=-1&page=1&sidx=&sord=asc"
'Load the response in an XML document, and retrieve the material information

Sub getInfoWeb()

    Dim xhr As MSXML2.XMLHTTP60
    Dim doc As MSXML2.DOMDocument60
    Dim xmlCell As MSXML2.IXMLDOMElement
    Dim xmlCells As MSXML2.IXMLDOMNodeList
    Dim materialValueElement As MSXML2.IXMLDOMElement

    Set xhr = New MSXML2.XMLHTTP60

        With xhr

            .Open "GET", "http://www.pfconcept.com/cgi-bin/wspd_pcdb_cgi.sh/y/y2productspec-ajax.p?itemc=10031700&_search=false&rows=-1&page=1&sidx=&sord=asc", False
            .send

            If .readyState = 4 And .Status = 200 Then
                Set doc = New MSXML2.DOMDocument60
                doc.LoadXML .responseText
            Else
                MsgBox "Error" & vbNewLine & "Ready state: " & .readyState & _
                vbNewLine & "HTTP request status: " & .Status
            End If

        End With

        Set xmlCells = doc.getElementsByTagName("cell")

        For Each xmlCell In xmlCells
            If xmlCell.Text = "Materiaal" Then
                Set materialValueElement = xmlCell.NextSibling
            End If
        Next

        MsgBox materialValueElement.Text

End Sub
Run Code Online (Sandbox Code Playgroud)

EDIT2:IE的另一种自动化

Sub searchWebViaIE()
    Dim ie As SHDocVw.InternetExplorer
    Dim doc As MSHTML.HTMLDocument
    Dim anchors As MSHTML.IHTMLElementCollection
    Dim anchor As MSHTML.HTMLAnchorElement
    Dim prodSpec As MSHTML.HTMLAnchorElement
    Dim tableCells As MSHTML.IHTMLElementCollection
    Dim materialValueElement As MSHTML.HTMLTableCell
    Dim tableCell As MSHTML.HTMLTableCell

    Set ie = New SHDocVw.InternetExplorer

    With ie
        .navigate "http://www.pfconcept.com/cgi-bin/wspd_pcdb_cgi.sh/y/y2facetmain.p?fctkeywords=100317&world=general#tabs-4"
        .Visible = True

        Do While .readyState <> READYSTATE_COMPLETE Or .Busy = True
            DoEvents
        Loop

        Set doc = .document

        Set anchors = doc.getElementsByTagName("a")

        For Each anchor In anchors
            If InStr(anchor.innerHTML, "Product Specificatie") <> 0 Then
                anchor.Click
                Exit For
            End If
        Next anchor

        Do While .readyState <> READYSTATE_COMPLETE Or .Busy = True
            DoEvents
        Loop

    End With

    For Each anchor In anchors
        If InStr(anchor.innerHTML, "Product Specificatie") <> 0 Then
            Set prodSpec = anchor
        End If
    Next anchor

    Set tableCells = doc.getElementById("list-table").getElementsByTagName("td")

    If Not tableCells Is Nothing Then
        For Each tableCell In tableCells
            If tableCell.innerHTML = "Materiaal" Then
                Set materialValueElement = tableCell.NextSibling
            End If
        Next tableCell
    End If

    MsgBox materialValueElement.innerHTML

End Sub
Run Code Online (Sandbox Code Playgroud)